From ca40e295018f7df46c7b8a12c3e82b88418322fb Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 30 Aug 2013 09:52:51 -0700
Subject: fs/minix: Drop dependency on H8300

The H8/300 architecture is gone. Drop dependencies on it.

Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 fs/minix/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index 6624684dd5de..f2a0cfcef11d 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -18,7 +18,7 @@ config MINIX_FS
 
 config MINIX_FS_NATIVE_ENDIAN
 	def_bool MINIX_FS
-	depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
+	depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
 
 config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
 	def_bool MINIX_FS
-- 
cgit v1.3


From 0d0d110720d7960b77c03c9f2597faaff4b484ae Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 16 Sep 2013 14:52:00 +0200
Subject: GFS2: d_splice_alias() can't return error

unless it was given an IS_ERR(inode), which isn't the case here.  So clean
up the unnecessary error handling in gfs2_create_inode().

This paves the way for real fixes (hence the stable Cc).

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: stable@vger.kernel.org
---
 fs/gfs2/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 64915eeae5a7..6d7f976aa328 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -584,7 +584,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (!IS_ERR(inode)) {
 		d = d_splice_alias(inode, dentry);
 		error = 0;
-		if (file && !IS_ERR(d)) {
+		if (file) {
 			if (d == NULL)
 				d = dentry;
 			if (S_ISREG(inode->i_mode))
@@ -593,8 +593,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 				error = finish_no_open(file, d);
 		}
 		gfs2_glock_dq_uninit(ghs);
-		if (IS_ERR(d))
-			return PTR_ERR(d);
 		return error;
 	} else if (error != -ENOENT) {
 		goto fail_gunlock;
-- 
cgit v1.3


From 6aa7640f306a1dd0923d29d9190c5686907d5140 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 11 Sep 2013 13:44:01 -0500
Subject: GFS2: optimize rbm_from_block wrt bi_start

In function gfs2_rbm_from_block, it starts by checking if the block
falls within the first bitmap. It does so by checking if the rbm's
offset is less than (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY.
However, the first bitmap will always have bi_start==0. Therefore
this is an unnecessary calculation in a function that gets called
billions of times. This patch removes the reference to bi_start.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 69317435faa7..7a6fa03bb32a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -262,7 +262,7 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 	rbm->bi = rbm->rgd->rd_bits;
 	rbm->offset = (u32)(rblock);
 	/* Check if the block is within the first block */
-	if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY)
+	if (rbm->offset < rbm->bi->bi_len * GFS2_NBBY)
 		return 0;
 
 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
-- 
cgit v1.3


From 7e230f5774336c5c180d8aeab0e2cb1326c73fa3 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 11 Sep 2013 13:44:02 -0500
Subject: GFS2: introduce bi_blocks for optimization

This patch introduces a new field in the bitmap structure called
bi_blocks. Its purpose is to save us from constantly multiplying
bi_len by the constant GFS2_NBBY. It also paves the way for more
optimization in a future patch.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 1 +
 fs/gfs2/rgrp.c   | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 26aabd7caba7..f1a3243dfaf2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -71,6 +71,7 @@ struct gfs2_bitmap {
 	u32 bi_offset;
 	u32 bi_start;
 	u32 bi_len;
+	u32 bi_blocks;
 };
 
 struct gfs2_rgrpd {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7a6fa03bb32a..7d64a27683d9 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -262,7 +262,7 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 	rbm->bi = rbm->rgd->rd_bits;
 	rbm->offset = (u32)(rblock);
 	/* Check if the block is within the first block */
-	if (rbm->offset < rbm->bi->bi_len * GFS2_NBBY)
+	if (rbm->offset < rbm->bi->bi_blocks)
 		return 0;
 
 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
@@ -743,18 +743,21 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
 			bi->bi_offset = sizeof(struct gfs2_rgrp);
 			bi->bi_start = 0;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* header block */
 		} else if (x == 0) {
 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
 			bi->bi_offset = sizeof(struct gfs2_rgrp);
 			bi->bi_start = 0;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* last block */
 		} else if (x + 1 == length) {
 			bytes = bytes_left;
 			bi->bi_offset = sizeof(struct gfs2_meta_header);
 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* other blocks */
 		} else {
 			bytes = sdp->sd_sb.sb_bsize -
@@ -762,6 +765,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
 			bi->bi_offset = sizeof(struct gfs2_meta_header);
 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		}
 
 		bytes_left -= bytes;
-- 
cgit v1.3


From b8708905199a85eebbd820f98d18e045c32077bf Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 11 Sep 2013 13:44:05 -0500
Subject: GFS2: Do not reset flags on active reservations

When we used try locks for rgrps on block allocations, it was important
to clear the flags field so that we used a blocking hold on the glock.
Now that we're not doing try locks, clearing flags is unnecessary, and
a waste of time. In fact, it's probably doing the wrong thing because
it clears the GL_SKIP bit that was set for the lvb tracking purposes.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7d64a27683d9..8e5003820aa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1825,7 +1825,6 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
 		return -EINVAL;
 	if (gfs2_rs_active(rs)) {
 		begin = rs->rs_rbm.rgd;
-		flags = 0; /* Yoda: Do or do not. There is no try */
 	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
 		rs->rs_rbm.rgd = begin = ip->i_rgd;
 	} else {
-- 
cgit v1.3


From e579ed4f446e64748a2d26eed8f8b28f728495bd Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 17 Sep 2013 13:12:15 -0400
Subject: GFS2: Introduce rbm field bii

This is a respin of the original patch. As Steve pointed out, the
introduction of field bii makes it easy to eliminate bi itself.
This revised patch does just that, replacing bi with bii.

This patch adds a new field to the rbm structure, called bii,
which is an index into the array of bitmaps for an rgrp.
This replaces *bi which was a pointer to the bitmap.
This is being done for further optimizations.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  12 +++++--
 fs/gfs2/rgrp.c   | 107 +++++++++++++++++++++++++++++--------------------------
 2 files changed, 65 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f1a3243dfaf2..8c8f110d8e35 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -102,19 +102,25 @@ struct gfs2_rgrpd {
 
 struct gfs2_rbm {
 	struct gfs2_rgrpd *rgd;
-	struct gfs2_bitmap *bi;	/* Bitmap must belong to the rgd */
 	u32 offset;		/* The offset is bitmap relative */
+	int bii;		/* Bitmap index */
 };
 
+static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
+{
+	return rbm->rgd->rd_bits + rbm->bii;
+}
+
 static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
 {
-	return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset;
+	return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
+		rbm->offset;
 }
 
 static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
 			       const struct gfs2_rbm *rbm2)
 {
-	return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) && 
+	return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) &&
 	       (rbm1->offset == rbm2->offset);
 }
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8e5003820aa7..dd3c4d3d7f41 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -81,11 +81,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 			       unsigned char new_state)
 {
 	unsigned char *byte1, *byte2, *end, cur_state;
-	unsigned int buflen = rbm->bi->bi_len;
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	unsigned int buflen = bi->bi_len;
 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
 
-	byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
-	end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen;
+	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
+	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
 
 	BUG_ON(byte1 >= end);
 
@@ -95,18 +96,17 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 		printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, "
 		       "new_state=%d\n", rbm->offset, cur_state, new_state);
 		printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n",
-		       (unsigned long long)rbm->rgd->rd_addr,
-		       rbm->bi->bi_start);
+		       (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
 		printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n",
-		       rbm->bi->bi_offset, rbm->bi->bi_len);
+		       bi->bi_offset, bi->bi_len);
 		dump_stack();
 		gfs2_consist_rgrpd(rbm->rgd);
 		return;
 	}
 	*byte1 ^= (cur_state ^ new_state) << bit;
 
-	if (do_clone && rbm->bi->bi_clone) {
-		byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
+	if (do_clone && bi->bi_clone) {
+		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
 		*byte2 ^= (cur_state ^ new_state) << bit;
 	}
@@ -121,7 +121,8 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 
 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
 {
-	const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset;
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
 	const u8 *byte;
 	unsigned int bit;
 
@@ -252,25 +253,23 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 {
 	u64 rblock = block - rbm->rgd->rd_data0;
-	u32 x;
 
 	if (WARN_ON_ONCE(rblock > UINT_MAX))
 		return -EINVAL;
 	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
 		return -E2BIG;
 
-	rbm->bi = rbm->rgd->rd_bits;
+	rbm->bii = 0;
 	rbm->offset = (u32)(rblock);
 	/* Check if the block is within the first block */
-	if (rbm->offset < rbm->bi->bi_blocks)
+	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
 		return 0;
 
 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
 	rbm->offset += (sizeof(struct gfs2_rgrp) -
 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
-	x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
-	rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
-	rbm->bi += x;
+	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
+	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
 	return 0;
 }
 
@@ -328,6 +327,7 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
 	u32 chunk_size;
 	u8 *ptr, *start, *end;
 	u64 block;
+	struct gfs2_bitmap *bi;
 
 	if (n_unaligned &&
 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
@@ -336,11 +336,12 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
 	n_unaligned = len & 3;
 	/* Start is now byte aligned */
 	while (len > 3) {
-		start = rbm.bi->bi_bh->b_data;
-		if (rbm.bi->bi_clone)
-			start = rbm.bi->bi_clone;
-		end = start + rbm.bi->bi_bh->b_size;
-		start += rbm.bi->bi_offset;
+		bi = rbm_bi(&rbm);
+		start = bi->bi_bh->b_data;
+		if (bi->bi_clone)
+			start = bi->bi_clone;
+		end = start + bi->bi_bh->b_size;
+		start += bi->bi_offset;
 		BUG_ON(rbm.offset & 3);
 		start += (rbm.offset / GFS2_NBBY);
 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
@@ -605,11 +606,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
 	RB_CLEAR_NODE(&rs->rs_node);
 
 	if (rs->rs_free) {
+		struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
+
 		/* return reserved blocks to the rgrp */
 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
 		rs->rs_free = 0;
-		clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags);
+		clear_bit(GBF_FULL, &bi->bi_flags);
 		smp_mb__after_clear_bit();
 	}
 }
@@ -1558,14 +1561,14 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
 			 const struct gfs2_inode *ip, bool nowrap)
 {
 	struct buffer_head *bh;
-	struct gfs2_bitmap *initial_bi;
+	int initial_bii;
 	u32 initial_offset;
 	u32 offset;
 	u8 *buffer;
-	int index;
 	int n = 0;
 	int iters = rbm->rgd->rd_length;
 	int ret;
+	struct gfs2_bitmap *bi;
 
 	/* If we are not starting at the beginning of a bitmap, then we
 	 * need to add one to the bitmap count to ensure that we search
@@ -1575,52 +1578,53 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
 		iters++;
 
 	while(1) {
-		if (test_bit(GBF_FULL, &rbm->bi->bi_flags) &&
+		bi = rbm_bi(rbm);
+		if (test_bit(GBF_FULL, &bi->bi_flags) &&
 		    (state == GFS2_BLKST_FREE))
 			goto next_bitmap;
 
-		bh = rbm->bi->bi_bh;
-		buffer = bh->b_data + rbm->bi->bi_offset;
+		bh = bi->bi_bh;
+		buffer = bh->b_data + bi->bi_offset;
 		WARN_ON(!buffer_uptodate(bh));
-		if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone)
-			buffer = rbm->bi->bi_clone + rbm->bi->bi_offset;
+		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
 		initial_offset = rbm->offset;
-		offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state);
+		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
 		if (offset == BFITNOENT)
 			goto bitmap_full;
 		rbm->offset = offset;
 		if (ip == NULL)
 			return 0;
 
-		initial_bi = rbm->bi;
+		initial_bii = rbm->bii;
 		ret = gfs2_reservation_check_and_update(rbm, ip, minext);
 		if (ret == 0)
 			return 0;
 		if (ret > 0) {
-			n += (rbm->bi - initial_bi);
+			n += (rbm->bii - initial_bii);
 			goto next_iter;
 		}
 		if (ret == -E2BIG) {
-			index = 0;
+			rbm->bii = 0;
 			rbm->offset = 0;
-			n += (rbm->bi - initial_bi);
+			n += (rbm->bii - initial_bii);
 			goto res_covered_end_of_rgrp;
 		}
 		return ret;
 
 bitmap_full:	/* Mark bitmap as full and fall through */
-		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
-			set_bit(GBF_FULL, &rbm->bi->bi_flags);
+		if ((state == GFS2_BLKST_FREE) && initial_offset == 0) {
+			struct gfs2_bitmap *bi = rbm_bi(rbm);
+			set_bit(GBF_FULL, &bi->bi_flags);
+		}
 
 next_bitmap:	/* Find next bitmap in the rgrp */
 		rbm->offset = 0;
-		index = rbm->bi - rbm->rgd->rd_bits;
-		index++;
-		if (index == rbm->rgd->rd_length)
-			index = 0;
+		rbm->bii++;
+		if (rbm->bii == rbm->rgd->rd_length)
+			rbm->bii = 0;
 res_covered_end_of_rgrp:
-		rbm->bi = &rbm->rgd->rd_bits[index];
-		if ((index == 0) && nowrap)
+		if ((rbm->bii == 0) && nowrap)
 			break;
 		n++;
 next_iter:
@@ -1649,7 +1653,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 	struct gfs2_inode *ip;
 	int error;
 	int found = 0;
-	struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 };
+	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
 
 	while (1) {
 		down_write(&sdp->sd_log_flush_lock);
@@ -1976,14 +1980,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
 
 	*n = 1;
 	block = gfs2_rbm_to_block(rbm);
-	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh);
+	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
 	block++;
 	while (*n < elen) {
 		ret = gfs2_rbm_from_block(&pos, block);
 		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
 			break;
-		gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh);
+		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
 		(*n)++;
 		block++;
@@ -2004,6 +2008,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 				     u32 blen, unsigned char new_state)
 {
 	struct gfs2_rbm rbm;
+	struct gfs2_bitmap *bi;
 
 	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
 	if (!rbm.rgd) {
@@ -2014,15 +2019,15 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 
 	while (blen--) {
 		gfs2_rbm_from_block(&rbm, bstart);
+		bi = rbm_bi(&rbm);
 		bstart++;
-		if (!rbm.bi->bi_clone) {
-			rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size,
-						   GFP_NOFS | __GFP_NOFAIL);
-			memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset,
-			       rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
-			       rbm.bi->bi_len);
+		if (!bi->bi_clone) {
+			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+					       GFP_NOFS | __GFP_NOFAIL);
+			memcpy(bi->bi_clone + bi->bi_offset,
+			       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
 		}
-		gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh);
+		gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
 		gfs2_setbit(&rbm, false, new_state);
 	}
 
-- 
cgit v1.3


From 149ed7f51e279916e7a7a1eef5cec50da67d7cfe Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 17 Sep 2013 13:14:35 -0400
Subject: GFS2: new function gfs2_rbm_incr

Since the previous patch eliminated bi in favor of bii, this follow-on
patch needed to be adjusted accordingly. Here is the revised version.

This patch adds a new function, gfs2_rbm_incr, which increments
an rbm structure. This is more efficient than calling gfs2_rbm_to_block,
incrementing, then calling gfs2_rbm_from_block.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index dd3c4d3d7f41..285dd363199a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -273,6 +273,32 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 	return 0;
 }
 
+/**
+ * gfs2_rbm_incr - increment an rbm structure
+ * @rbm: The rbm with rgd already set correctly
+ *
+ * This function takes an existing rbm structure and increments it to the next
+ * viable block offset.
+ *
+ * Returns: If incrementing the offset would cause the rbm to go past the
+ *          end of the rgrp, true is returned, otherwise false.
+ *
+ */
+
+static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
+{
+	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
+		rbm->offset++;
+		return false;
+	}
+	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
+		return true;
+
+	rbm->offset = 0;
+	rbm->bii++;
+	return false;
+}
+
 /**
  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
  * @rbm: Position to search (value/result)
@@ -284,7 +310,6 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 
 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
 {
-	u64 block;
 	u32 n;
 	u8 res;
 
@@ -295,8 +320,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le
 		(*len)--;
 		if (*len == 0)
 			return true;
-		block = gfs2_rbm_to_block(rbm);
-		if (gfs2_rbm_from_block(rbm, block + 1))
+		if (gfs2_rbm_incr(rbm))
 			return true;
 	}
 
-- 
cgit v1.3


From 5ca1db41ecdeb0358b968265fadb755213558a85 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Mon, 23 Sep 2013 13:21:04 +0100
Subject: GFS2: fix dentry leaks

We need to dput() the result of d_splice_alias(), unless it is passed to
finish_no_open().

Edited by Steven Whitehouse in order to make it apply to the current
GFS2 git tree, and taking account of a prerequisite patch which hasn't
been applied.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: stable@vger.kernel.org
---
 fs/gfs2/inode.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6d7f976aa328..cd58611912f5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -585,12 +585,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		d = d_splice_alias(inode, dentry);
 		error = 0;
 		if (file) {
-			if (d == NULL)
-				d = dentry;
-			if (S_ISREG(inode->i_mode))
-				error = finish_open(file, d, gfs2_open_common, opened);
-			else
+			if (S_ISREG(inode->i_mode)) {
+				WARN_ON(d != NULL);
+				error = finish_open(file, dentry, gfs2_open_common, opened);
+			} else {
 				error = finish_no_open(file, d);
+			}
+		} else {
+			dput(d);
 		}
 		gfs2_glock_dq_uninit(ghs);
 		return error;
@@ -777,8 +779,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
 		error = finish_open(file, dentry, gfs2_open_common, opened);
 
 	gfs2_glock_dq_uninit(&gh);
-	if (error)
+	if (error) {
+		dput(d);
 		return ERR_PTR(error);
+	}
 	return d;
 }
 
@@ -1159,14 +1163,16 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
 	d = __gfs2_lookup(dir, dentry, file, opened);
 	if (IS_ERR(d))
 		return PTR_ERR(d);
-	if (d == NULL)
-		d = dentry;
-	if (d->d_inode) {
+	if (d != NULL)
+		dentry = d;
+	if (dentry->d_inode) {
 		if (!(*opened & FILE_OPENED))
-			return finish_no_open(file, d);
+			return finish_no_open(file, dentry);
+		dput(d);
 		return 0;
 	}
 
+	BUG_ON(d != NULL);
 	if (!(flags & O_CREAT))
 		return -ENOENT;
 
-- 
cgit v1.3


From e76eebee70948060d80b5f9edd2d97062049df53 Mon Sep 17 00:00:00 2001
From: Yu Chao <chao2.yu@samsung.com>
Date: Thu, 12 Sep 2013 11:17:51 +0800
Subject: f2fs: optimize fs_lock for better performance

There is a performance problem: when all sbi->fs_lock are holded, then
all the following threads may get the same next_lock value from sbi->next_lock_num
in function mutex_lock_op, and wait for the same lock(fs_lock[next_lock]),
it may cause performance reduce.
So we move the sbi->next_lock_num++ before getting lock, this will average the
following threads if all sbi->fs_lock are holded.

v1-->v2:
	Drop the needless spin_lock as Jaegeuk suggested.

Suggested-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Signed-off-by: Yu Chao <chao2.yu@samsung.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/f2fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 608f0df5b919..7fd99d8bd2ff 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -544,15 +544,15 @@ static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
 
 static inline int mutex_lock_op(struct f2fs_sb_info *sbi)
 {
-	unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS;
+	unsigned char next_lock;
 	int i = 0;
 
 	for (; i < NR_GLOBAL_LOCKS; i++)
 		if (mutex_trylock(&sbi->fs_lock[i]))
 			return i;
 
+	next_lock = sbi->next_lock_num++ % NR_GLOBAL_LOCKS;
 	mutex_lock(&sbi->fs_lock[next_lock]);
-	sbi->next_lock_num++;
 	return next_lock;
 }
 
-- 
cgit v1.3


From a57e564d14d9d123b2dd4ff1c933da0d900e0b1d Mon Sep 17 00:00:00 2001
From: Jin Xu <jinuxstyle@gmail.com>
Date: Fri, 13 Sep 2013 08:38:54 +0800
Subject: f2fs: optimize the victim searching loop slightly

Since the MAX_VICTIM_SEARCH has been enlarged from 20 to 4096,
the victim searching overhead will be increased much than before,
especially for SSR that searches victim for use quiet often.
This patch intends to reduce the overhead a little bit by:
- make the get_gc_cost a inline routine to reduce function call
  overhead
- reduce multiplication and division operations
- reduce unnecessary comparison operation

Signed-off-by: Jin Xu <jinuxstyle@gmail.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/gc.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 2f157e883687..fbad96846c7c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -236,8 +236,8 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
 	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
 }
 
-static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
-					struct victim_sel_policy *p)
+static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
+			unsigned int segno, struct victim_sel_policy *p)
 {
 	if (p->alloc_mode == SSR)
 		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
@@ -293,7 +293,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 			}
 			break;
 		}
-		p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+
+		p.offset = segno + p.ofs_unit;
+		if (p.ofs_unit > 1)
+			p.offset -= segno % p.ofs_unit;
+
 		secno = GET_SECNO(sbi, segno);
 
 		if (sec_usage_check(sbi, secno))
@@ -306,10 +310,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 		if (p.min_cost > cost) {
 			p.min_segno = segno;
 			p.min_cost = cost;
-		}
-
-		if (cost == max_cost)
+		} else if (unlikely(cost == max_cost)) {
 			continue;
+		}
 
 		if (nsearched++ >= p.max_search) {
 			sbi->last_victim[p.gc_mode] = segno;
-- 
cgit v1.3


From cc7b1bb173676621b092b61d22d8d12b05efb5e8 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Sun, 22 Sep 2013 15:50:50 +0800
Subject: f2fs: avoid allocating failure in bio_alloc

This patch add macro MAX_BIO_BLOCKS to limit value of npages in
f2fs_bio_alloc, it can avoid allocating failure in bio_alloc caused by
npages is larger than BIO_MAX_PAGES.

Signed-off-by: Yu Chao <chao2.yu@samsung.com>
Reviewed-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 4 +++-
 fs/f2fs/segment.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 09af9c7b0f52..bd79bbeceb1d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -657,6 +657,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
 				block_t blk_addr, enum page_type type)
 {
 	struct block_device *bdev = sbi->sb->s_bdev;
+	int bio_blocks;
 
 	verify_block_addr(sbi, blk_addr);
 
@@ -676,7 +677,8 @@ retry:
 			goto retry;
 		}
 
-		sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
+		bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+		sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks);
 		sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
 		sbi->bio[type]->bi_private = priv;
 		/*
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index bdd10eab8c40..7f94d78cda3d 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -90,6 +90,8 @@
 	(blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
 #define SECTOR_TO_BLOCK(sbi, sectors)					\
 	(sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+#define MAX_BIO_BLOCKS(max_hw_blocks)					\
+	(min((int)max_hw_blocks, BIO_MAX_PAGES))
 
 /* during checkpoint, bio_private is used to synchronize the last bio */
 struct bio_private {
-- 
cgit v1.3


From 691c6fd2a2d6b6db08b17beec5e42ab0687058c7 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 24 Sep 2013 09:26:24 +0800
Subject: f2fs: remove unneeded write checkpoint in recover_fsync_data

Previously, recover_fsync_data still to write checkpoint when there is
nothing to recover with normal umount image.
It may reduce mount performance and flash memory lifetime, so let's remove
it.

Signed-off-by: Tan Shu <shu.tan@samsung.com>
Signed-off-by: Yu Chao <chao2.yu@samsung.com>
Reviewed-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/recovery.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 51ef5eec33d7..d43e4cd1f815 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -419,6 +419,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 {
 	struct list_head inode_list;
 	int err;
+	int need_writecp = 0;
 
 	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
 			sizeof(struct fsync_inode_entry), NULL);
@@ -436,6 +437,8 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	if (list_empty(&inode_list))
 		goto out;
 
+	need_writecp = 1;
+
 	/* step #2: recover data */
 	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
 	BUG_ON(!list_empty(&inode_list));
@@ -443,7 +446,7 @@ out:
 	destroy_fsync_dnodes(&inode_list);
 	kmem_cache_destroy(fsync_entry_slab);
 	sbi->por_doing = 0;
-	if (!err)
+	if (!err && need_writecp)
 		write_checkpoint(sbi, false);
 	return err;
 }
-- 
cgit v1.3


From 885166c03c1d0ea6d79d707229340e3161ed1316 Mon Sep 17 00:00:00 2001
From: "Russ W. Knize" <russ@knizefamily.net>
Date: Tue, 24 Sep 2013 09:35:40 -0500
Subject: f2fs: don't let the orphan inode counter underflow

Accounting errors from buggy code calling the acquire/release/remove
orphan inode interfaces can cause n_orphans to underflow, which will
then cause acquire_orphan_inode() to return -ENOSPC on the next
operation.  This commit guards against that condition.

Signed-off-by: Russ Knize <rknize@motorola.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index bb312201ca95..ca3944240db5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -206,6 +206,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 void release_orphan_inode(struct f2fs_sb_info *sbi)
 {
 	mutex_lock(&sbi->orphan_inode_mutex);
+	BUG_ON(sbi->n_orphans == 0);
 	sbi->n_orphans--;
 	mutex_unlock(&sbi->orphan_inode_mutex);
 }
@@ -253,6 +254,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 		if (orphan->ino == ino) {
 			list_del(&orphan->list);
 			kmem_cache_free(orphan_entry_slab, orphan);
+			BUG_ON(sbi->n_orphans == 0);
 			sbi->n_orphans--;
 			break;
 		}
-- 
cgit v1.3


From 52ab956000214aa111254e52e5eac3d91bd7cf4b Mon Sep 17 00:00:00 2001
From: Russ Knize <Russ.Knize@motorola.com>
Date: Tue, 24 Sep 2013 15:49:23 -0500
Subject: f2fs: don't GC or take an fs_lock from f2fs_initxattrs()

f2fs_initxattrs() is called internally from within F2FS and should
not call functions that are used by VFS handlers.  This avoids
certain deadlocks:

- vfs_create()
 - f2fs_create() <-- takes an fs_lock
  - f2fs_add_link()
   - __f2fs_add_link()
    - init_inode_metadata()
     - f2fs_init_security()
      - security_inode_init_security()
       - f2fs_initxattrs()
        - f2fs_setxattr() <-- also takes an fs_lock

If the caller happens to grab the same fs_lock from the pool in both
places, they will deadlock.  There are also deadlocks involving
multiple threads and mutexes:

- f2fs_write_begin()
 - f2fs_balance_fs() <-- takes gc_mutex
  - f2fs_gc()
   - write_checkpoint()
    - block_operations()
     - mutex_lock_all() <-- blocks trying to grab all fs_locks

- f2fs_mkdir() <-- takes an fs_lock
 - __f2fs_add_link()
  - f2fs_init_security()
   - security_inode_init_security()
    - f2fs_initxattrs()
     - f2fs_setxattr()
      - f2fs_balance_fs() <-- blocks trying to take gc_mutex

Signed-off-by: Russ Knize <Russ.Knize@motorola.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/xattr.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 1ac8a5f6e380..3d900ea34e47 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -154,6 +154,9 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
 }
 
 #ifdef CONFIG_F2FS_FS_SECURITY
+static int __f2fs_setxattr(struct inode *inode, int name_index,
+			const char *name, const void *value, size_t value_len,
+			struct page *ipage);
 static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
 		void *page)
 {
@@ -161,7 +164,7 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
 	int err = 0;
 
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
-		err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+		err = __f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
 				xattr->name, xattr->value,
 				xattr->value_len, (struct page *)page);
 		if (err < 0)
@@ -469,16 +472,15 @@ cleanup:
 	return error;
 }
 
-int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
-			const void *value, size_t value_len, struct page *ipage)
+static int __f2fs_setxattr(struct inode *inode, int name_index,
+			const char *name, const void *value, size_t value_len,
+			struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct f2fs_xattr_entry *here, *last;
 	void *base_addr;
 	int found, newsize;
 	size_t name_len;
-	int ilock;
 	__u32 new_hsize;
 	int error = -ENOMEM;
 
@@ -493,10 +495,6 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
 	if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode))
 		return -ERANGE;
 
-	f2fs_balance_fs(sbi);
-
-	ilock = mutex_lock_op(sbi);
-
 	base_addr = read_all_xattrs(inode, ipage);
 	if (!base_addr)
 		goto exit;
@@ -578,7 +576,24 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
 	else
 		update_inode_page(inode);
 exit:
-	mutex_unlock_op(sbi, ilock);
 	kzfree(base_addr);
 	return error;
 }
+
+int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
+			const void *value, size_t value_len, struct page *ipage)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	int ilock;
+	int err;
+
+	f2fs_balance_fs(sbi);
+
+	ilock = mutex_lock_op(sbi);
+
+	err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage);
+
+	mutex_unlock_op(sbi, ilock);
+
+	return err;
+}
-- 
cgit v1.3


From 2e5558f4a5cf16a7394fd5770087303db8912c66 Mon Sep 17 00:00:00 2001
From: "Russ W. Knize" <rknize@gmail.com>
Date: Tue, 24 Sep 2013 09:40:57 -0500
Subject: f2fs: account for orphan inodes during recovery

During recovery, orphan inodes are deleted via truncate_hole().
These orphans are added by recover_dentry() via f2fs_delete_entry().
However, f2fs_delete_entry() adds them via add_orphan_inode()
without calling acquire_orphan_inode() first.  This prevents the
counters from being incremented properly, which causes them to
underflow when remove_orphan_inode() is called later on.

Signed-off-by: Russ Knize <rknize@motorola.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/recovery.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d43e4cd1f815..a15d122fdc50 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -64,24 +64,31 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
 	name.name = raw_inode->i_name;
 retry:
 	de = f2fs_find_entry(dir, &name, &page);
-	if (de && inode->i_ino == le32_to_cpu(de->ino)) {
-		kunmap(page);
-		f2fs_put_page(page, 0);
-		goto out;
-	}
+	if (de && inode->i_ino == le32_to_cpu(de->ino))
+		goto out_unmap_put;
 	if (de) {
 		einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
 		if (IS_ERR(einode)) {
 			WARN_ON(1);
 			if (PTR_ERR(einode) == -ENOENT)
 				err = -EEXIST;
-			goto out;
+			goto out_unmap_put;
+		}
+		err = acquire_orphan_inode(F2FS_SB(inode->i_sb));
+		if (err) {
+			iput(einode);
+			goto out_unmap_put;
 		}
 		f2fs_delete_entry(de, page, einode);
 		iput(einode);
 		goto retry;
 	}
 	err = __f2fs_add_link(dir, &name, inode);
+	goto out;
+
+out_unmap_put:
+	kunmap(page);
+	f2fs_put_page(page, 0);
 out:
 	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
 			"ino = %x, name = %s, dir = %lx, err = %d",
-- 
cgit v1.3


From bcac3769ca6d6278f93afb6cc2b234d260ee5951 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 11 Sep 2013 22:29:03 -0400
Subject: sysfs: drop semicolon from to_sysfs_dirent() definition

The expansion of to_sysfs_dirent() contains an unncessary trailing
semicolon making it impossible to use in the middle of statements.
Drop it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4d83cedb9fcb..834c64cb7f88 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -28,7 +28,7 @@
 DEFINE_MUTEX(sysfs_mutex);
 DEFINE_SPINLOCK(sysfs_assoc_lock);
 
-#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb);
+#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
 
 static DEFINE_SPINLOCK(sysfs_ino_lock);
 static DEFINE_IDA(sysfs_ino_ida);
-- 
cgit v1.3


From 58292cbe6669d74498a5f08db13e57cb3bcfb81d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 11 Sep 2013 22:29:04 -0400
Subject: sysfs: make attr namespace interface less convoluted

sysfs ns (namespace) implementation became more convoluted than
necessary while trying to hide ns information from visible interface.
The relatively recent attr ns support is a good example.

* attr ns tag is determined by sysfs_ops->namespace() callback while
  dir tag is determined by kobj_type->namespace().  The placement is
  arbitrary.

* Instead of performing operations with explicit ns tag, the namespace
  callback is routed through sysfs_attr_ns(), sysfs_ops->namespace(),
  class_attr_namespace(), class_attr->namespace().  It's not simpler
  in any sense.  The only thing this convolution does is traversing
  the whole stack backwards.

The namespace callbacks are unncessary because the operations involved
are inherently synchronous.  The information can be provided in in
straight-forward top-down direction and reversing that direction is
unnecessary and against basic design principles.

This backward interface is unnecessarily convoluted and hinders
properly separating out sysfs from driver model / kobject for proper
layering.  This patch updates attr ns support such that

* sysfs_ops->namespace() and class_attr->namespace() are dropped.

* sysfs_{create|remove}_file_ns(), which take explicit @ns param, are
  added and sysfs_{create|remove}_file() are now simple wrappers
  around the ns aware functions.

* ns handling is dropped from sysfs_chmod_file().  Nobody uses it at
  this point.  sysfs_chmod_file_ns() can be added later if necessary.

* Explicit @ns is propagated through class_{create|remove}_file_ns()
  and netdev_class_{create|remove}_file_ns().

* driver/net/bonding which is currently the only user of attr
  namespace is updated to use netdev_class_{create|remove}_file_ns()
  with @bh->net as the ns tag instead of using the namespace callback.

This patch should be an equivalent conversion without any functional
difference.  It makes the code easier to follow, reduces lines of code
a bit and helps proper separation and layering.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kay Sievers <kay@vrfy.org>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/class.c             | 29 ++++--------
 drivers/net/bonding/bond_sysfs.c | 14 ++----
 fs/sysfs/file.c                  | 95 ++++++++++------------------------------
 fs/sysfs/group.c                 |  7 +--
 fs/sysfs/sysfs.h                 |  5 ++-
 include/linux/device.h           | 24 +++++++---
 include/linux/netdevice.h        | 16 ++++++-
 include/linux/sysfs.h            | 31 +++++++++----
 net/core/net-sysfs.c             | 14 +++---
 9 files changed, 106 insertions(+), 129 deletions(-)

(limited to 'fs')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 8b7818b80056..f96f70419a78 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -47,18 +47,6 @@ static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr,
 	return ret;
 }
 
-static const void *class_attr_namespace(struct kobject *kobj,
-					const struct attribute *attr)
-{
-	struct class_attribute *class_attr = to_class_attr(attr);
-	struct subsys_private *cp = to_subsys_private(kobj);
-	const void *ns = NULL;
-
-	if (class_attr->namespace)
-		ns = class_attr->namespace(cp->class, class_attr);
-	return ns;
-}
-
 static void class_release(struct kobject *kobj)
 {
 	struct subsys_private *cp = to_subsys_private(kobj);
@@ -86,7 +74,6 @@ static const struct kobj_ns_type_operations *class_child_ns_type(struct kobject
 static const struct sysfs_ops class_sysfs_ops = {
 	.show	   = class_attr_show,
 	.store	   = class_attr_store,
-	.namespace = class_attr_namespace,
 };
 
 static struct kobj_type class_ktype = {
@@ -99,21 +86,23 @@ static struct kobj_type class_ktype = {
 static struct kset *class_kset;
 
 
-int class_create_file(struct class *cls, const struct class_attribute *attr)
+int class_create_file_ns(struct class *cls, const struct class_attribute *attr,
+			 const void *ns)
 {
 	int error;
 	if (cls)
-		error = sysfs_create_file(&cls->p->subsys.kobj,
-					  &attr->attr);
+		error = sysfs_create_file_ns(&cls->p->subsys.kobj,
+					     &attr->attr, ns);
 	else
 		error = -EINVAL;
 	return error;
 }
 
-void class_remove_file(struct class *cls, const struct class_attribute *attr)
+void class_remove_file_ns(struct class *cls, const struct class_attribute *attr,
+			  const void *ns)
 {
 	if (cls)
-		sysfs_remove_file(&cls->p->subsys.kobj, &attr->attr);
+		sysfs_remove_file_ns(&cls->p->subsys.kobj, &attr->attr, ns);
 }
 
 static struct class *class_get(struct class *cls)
@@ -600,8 +589,8 @@ int __init classes_init(void)
 	return 0;
 }
 
-EXPORT_SYMBOL_GPL(class_create_file);
-EXPORT_SYMBOL_GPL(class_remove_file);
+EXPORT_SYMBOL_GPL(class_create_file_ns);
+EXPORT_SYMBOL_GPL(class_remove_file_ns);
 EXPORT_SYMBOL_GPL(class_unregister);
 EXPORT_SYMBOL_GPL(class_destroy);
 
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index c29b836749b6..ec9b6460a38d 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -149,14 +149,6 @@ err_no_cmd:
 	return -EPERM;
 }
 
-static const void *bonding_namespace(struct class *cls,
-				     const struct class_attribute *attr)
-{
-	const struct bond_net *bn =
-		container_of(attr, struct bond_net, class_attr_bonding_masters);
-	return bn->net;
-}
-
 /* class attribute for bond_masters file.  This ends up in /sys/class/net */
 static const struct class_attribute class_attr_bonding_masters = {
 	.attr = {
@@ -165,7 +157,6 @@ static const struct class_attribute class_attr_bonding_masters = {
 	},
 	.show = bonding_show_bonds,
 	.store = bonding_store_bonds,
-	.namespace = bonding_namespace,
 };
 
 int bond_create_slave_symlinks(struct net_device *master,
@@ -1787,7 +1778,8 @@ int bond_create_sysfs(struct bond_net *bn)
 	bn->class_attr_bonding_masters = class_attr_bonding_masters;
 	sysfs_attr_init(&bn->class_attr_bonding_masters.attr);
 
-	ret = netdev_class_create_file(&bn->class_attr_bonding_masters);
+	ret = netdev_class_create_file_ns(&bn->class_attr_bonding_masters,
+					  bn->net);
 	/*
 	 * Permit multiple loads of the module by ignoring failures to
 	 * create the bonding_masters sysfs file.  Bonding devices
@@ -1817,7 +1809,7 @@ int bond_create_sysfs(struct bond_net *bn)
  */
 void bond_destroy_sysfs(struct bond_net *bn)
 {
-	netdev_class_remove_file(&bn->class_attr_bonding_masters);
+	netdev_class_remove_file_ns(&bn->class_attr_bonding_masters, bn->net);
 }
 
 /*
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 15ef5eb13663..e784340f1599 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -485,58 +485,15 @@ const struct file_operations sysfs_file_operations = {
 	.poll		= sysfs_poll,
 };
 
-static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
-			 const void **pns)
-{
-	struct sysfs_dirent *dir_sd = kobj->sd;
-	const struct sysfs_ops *ops;
-	const void *ns = NULL;
-	int err;
-
-	if (!dir_sd) {
-		WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
-			kobject_name(kobj));
-		return -ENOENT;
-	}
-
-	err = 0;
-	if (!sysfs_ns_type(dir_sd))
-		goto out;
-
-	err = -EINVAL;
-	if (!kobj->ktype)
-		goto out;
-	ops = kobj->ktype->sysfs_ops;
-	if (!ops)
-		goto out;
-	if (!ops->namespace)
-		goto out;
-
-	err = 0;
-	ns = ops->namespace(kobj, attr);
-out:
-	if (err) {
-		WARN(1, KERN_ERR
-		     "missing sysfs namespace attribute operation for kobject: %s\n",
-		     kobject_name(kobj));
-	}
-	*pns = ns;
-	return err;
-}
-
-int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
-			const struct attribute *attr, int type, umode_t amode)
+int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
+			   const struct attribute *attr, int type,
+			   umode_t amode, const void *ns)
 {
 	umode_t mode = (amode & S_IALLUGO) | S_IFREG;
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
-	const void *ns;
 	int rc;
 
-	rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns);
-	if (rc)
-		return rc;
-
 	sd = sysfs_new_dirent(attr->name, mode, type);
 	if (!sd)
 		return -ENOMEM;
@@ -559,23 +516,25 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
 int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
 		   int type)
 {
-	return sysfs_add_file_mode(dir_sd, attr, type, attr->mode);
+	return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL);
 }
 
-
 /**
- *	sysfs_create_file - create an attribute file for an object.
- *	@kobj:	object we're creating for.
- *	@attr:	attribute descriptor.
+ * sysfs_create_file_ns - create an attribute file for an object with custom ns
+ * @kobj: object we're creating for
+ * @attr: attribute descriptor
+ * @ns: namespace the new file should belong to
  */
-int sysfs_create_file(struct kobject *kobj, const struct attribute *attr)
+int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
+			 const void *ns)
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
+	return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR,
+				      attr->mode, ns);
 
 }
-EXPORT_SYMBOL_GPL(sysfs_create_file);
+EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
 
 int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
 {
@@ -630,17 +589,12 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 {
 	struct sysfs_dirent *sd;
 	struct iattr newattrs;
-	const void *ns;
 	int rc;
 
-	rc = sysfs_attr_ns(kobj, attr, &ns);
-	if (rc)
-		return rc;
-
 	mutex_lock(&sysfs_mutex);
 
 	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, ns, attr->name);
+	sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
 	if (!sd)
 		goto out;
 
@@ -655,22 +609,21 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 /**
- *	sysfs_remove_file - remove an object attribute.
- *	@kobj:	object we're acting for.
- *	@attr:	attribute descriptor.
+ * sysfs_remove_file_ns - remove an object attribute with a custom ns tag
+ * @kobj: object we're acting for
+ * @attr: attribute descriptor
+ * @ns: namespace tag of the file to remove
  *
- *	Hash the attribute name and kill the victim.
+ * Hash the attribute name and namespace tag and kill the victim.
  */
-void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr)
+void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
+			  const void *ns)
 {
-	const void *ns;
-
-	if (sysfs_attr_ns(kobj, attr, &ns))
-		return;
+	struct sysfs_dirent *dir_sd = kobj->sd;
 
-	sysfs_hash_and_remove(kobj->sd, ns, attr->name);
+	sysfs_hash_and_remove(dir_sd, ns, attr->name);
 }
-EXPORT_SYMBOL_GPL(sysfs_remove_file);
+EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
 
 void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
 {
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 5f92cd2f61c1..25c78f23dae8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -56,9 +56,10 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 				if (!mode)
 					continue;
 			}
-			error = sysfs_add_file_mode(dir_sd, *attr,
-						    SYSFS_KOBJ_ATTR,
-						    (*attr)->mode | mode);
+			error = sysfs_add_file_mode_ns(dir_sd, *attr,
+						       SYSFS_KOBJ_ATTR,
+						       (*attr)->mode | mode,
+						       NULL);
 			if (unlikely(error))
 				break;
 		}
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index b6deca3e301d..a96da2559db2 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -230,8 +230,9 @@ extern const struct file_operations sysfs_file_operations;
 int sysfs_add_file(struct sysfs_dirent *dir_sd,
 		   const struct attribute *attr, int type);
 
-int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
-			const struct attribute *attr, int type, umode_t amode);
+int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
+			   const struct attribute *attr, int type,
+			   umode_t amode, const void *ns);
 /*
  * bin.c
  */
diff --git a/include/linux/device.h b/include/linux/device.h
index 2a9d6ed59579..ce690ea34547 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -427,8 +427,6 @@ struct class_attribute {
 			char *buf);
 	ssize_t (*store)(struct class *class, struct class_attribute *attr,
 			const char *buf, size_t count);
-	const void *(*namespace)(struct class *class,
-				 const struct class_attribute *attr);
 };
 
 #define CLASS_ATTR(_name, _mode, _show, _store) \
@@ -438,10 +436,24 @@ struct class_attribute {
 #define CLASS_ATTR_RO(_name) \
 	struct class_attribute class_attr_##_name = __ATTR_RO(_name)
 
-extern int __must_check class_create_file(struct class *class,
-					  const struct class_attribute *attr);
-extern void class_remove_file(struct class *class,
-			      const struct class_attribute *attr);
+extern int __must_check class_create_file_ns(struct class *class,
+					     const struct class_attribute *attr,
+					     const void *ns);
+extern void class_remove_file_ns(struct class *class,
+				 const struct class_attribute *attr,
+				 const void *ns);
+
+static inline int __must_check class_create_file(struct class *class,
+					const struct class_attribute *attr)
+{
+	return class_create_file_ns(class, attr, NULL);
+}
+
+static inline void class_remove_file(struct class *class,
+				     const struct class_attribute *attr)
+{
+	return class_remove_file_ns(class, attr, NULL);
+}
 
 /* Simple class attribute that is just a static string */
 struct class_attribute_string {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3de49aca4519..42421ed49a47 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2873,8 +2873,20 @@ extern int __init dev_proc_init(void);
 #define dev_proc_init() 0
 #endif
 
-extern int netdev_class_create_file(struct class_attribute *class_attr);
-extern void netdev_class_remove_file(struct class_attribute *class_attr);
+extern int netdev_class_create_file_ns(struct class_attribute *class_attr,
+				       const void *ns);
+extern void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+					const void *ns);
+
+static inline int netdev_class_create_file(struct class_attribute *class_attr)
+{
+	return netdev_class_create_file_ns(class_attr, NULL);
+}
+
+static inline void netdev_class_remove_file(struct class_attribute *class_attr)
+{
+	netdev_class_remove_file_ns(class_attr, NULL);
+}
 
 extern struct kobj_ns_type_operations net_ns_type_operations;
 
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 11baec7c9b26..82f7fac78e77 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -173,7 +173,6 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *, char *);
 	ssize_t	(*store)(struct kobject *, struct attribute *, const char *, size_t);
-	const void *(*namespace)(struct kobject *, const struct attribute *);
 };
 
 struct sysfs_dirent;
@@ -189,13 +188,15 @@ int __must_check sysfs_rename_dir(struct kobject *kobj, const char *new_name);
 int __must_check sysfs_move_dir(struct kobject *kobj,
 				struct kobject *new_parent_kobj);
 
-int __must_check sysfs_create_file(struct kobject *kobj,
-				   const struct attribute *attr);
+int __must_check sysfs_create_file_ns(struct kobject *kobj,
+				      const struct attribute *attr,
+				      const void *ns);
 int __must_check sysfs_create_files(struct kobject *kobj,
 				   const struct attribute **attr);
 int __must_check sysfs_chmod_file(struct kobject *kobj,
 				  const struct attribute *attr, umode_t mode);
-void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr);
+void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
+			  const void *ns);
 void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr);
 
 int __must_check sysfs_create_bin_file(struct kobject *kobj,
@@ -277,8 +278,9 @@ static inline int sysfs_move_dir(struct kobject *kobj,
 	return 0;
 }
 
-static inline int sysfs_create_file(struct kobject *kobj,
-				    const struct attribute *attr)
+static inline int sysfs_create_file_ns(struct kobject *kobj,
+				       const struct attribute *attr,
+				       const void *ns)
 {
 	return 0;
 }
@@ -295,8 +297,9 @@ static inline int sysfs_chmod_file(struct kobject *kobj,
 	return 0;
 }
 
-static inline void sysfs_remove_file(struct kobject *kobj,
-				     const struct attribute *attr)
+static inline void sysfs_remove_file_ns(struct kobject *kobj,
+					const struct attribute *attr,
+					const void *ns)
 {
 }
 
@@ -435,4 +438,16 @@ static inline int __must_check sysfs_init(void)
 
 #endif /* CONFIG_SYSFS */
 
+static inline int __must_check sysfs_create_file(struct kobject *kobj,
+						 const struct attribute *attr)
+{
+	return sysfs_create_file_ns(kobj, attr, NULL);
+}
+
+static inline void sysfs_remove_file(struct kobject *kobj,
+				     const struct attribute *attr)
+{
+	return sysfs_remove_file_ns(kobj, attr, NULL);
+}
+
 #endif /* _SYSFS_H_ */
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d954b56b4e47..325dee863e46 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1344,17 +1344,19 @@ int netdev_register_kobject(struct net_device *net)
 	return error;
 }
 
-int netdev_class_create_file(struct class_attribute *class_attr)
+int netdev_class_create_file_ns(struct class_attribute *class_attr,
+				const void *ns)
 {
-	return class_create_file(&net_class, class_attr);
+	return class_create_file_ns(&net_class, class_attr, ns);
 }
-EXPORT_SYMBOL(netdev_class_create_file);
+EXPORT_SYMBOL(netdev_class_create_file_ns);
 
-void netdev_class_remove_file(struct class_attribute *class_attr)
+void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+				 const void *ns)
 {
-	class_remove_file(&net_class, class_attr);
+	class_remove_file_ns(&net_class, class_attr, ns);
 }
-EXPORT_SYMBOL(netdev_class_remove_file);
+EXPORT_SYMBOL(netdev_class_remove_file_ns);
 
 int netdev_kobject_init(void)
 {
-- 
cgit v1.3


From e34ff4906199d2ebd248ae897ae34f52bea151c9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 11 Sep 2013 22:29:05 -0400
Subject: sysfs: remove ktype->namespace() invocations in directory code

For some unrecognizable reason, namespace information is communicated
to sysfs through ktype->namespace() callback when there's *nothing*
which needs the use of a callback.  The whole sequence of operations
is completely synchronous and sysfs operations simply end up calling
back into the layer which just invoked it in order to find out the
namespace information, which is completely backwards, obfuscates
what's going on and unnecessarily tangles two separate layers.

This patch doesn't remove ktype->namespace() but shifts its handling
to kobject layer.  We probably want to get rid of the callback in the
long term.

This patch adds an explicit param to sysfs_{create|rename|move}_dir()
and renames them to sysfs_{create|rename|move}_dir_ns(), respectively.
ktype->namespace() invocations are moved to the calling sites of the
above functions.  A new helper kboject_namespace() is introduced which
directly tests kobj_ns_type_operations->type which should give the
same result as testing sysfs_fs_type(parent_sd) and returns @kobj's
namespace tag as necessary.  kobject_namespace() is extern as it will
be used from another file in the following patches.

This patch should be an equivalent conversion without any functional
difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c          | 23 ++++++++---------------
 include/linux/kobject.h |  1 +
 include/linux/sysfs.h   | 20 ++++++++++++--------
 lib/kobject.c           | 28 ++++++++++++++++++++++++----
 4 files changed, 45 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 834c64cb7f88..878ac3afe1b8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -730,14 +730,14 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
 }
 
 /**
- *	sysfs_create_dir - create a directory for an object.
- *	@kobj:		object we're creating directory for.
+ * sysfs_create_dir_ns - create a directory for an object with a namespace tag
+ * @kobj: object we're creating directory for
+ * @ns: the namespace tag to use
  */
-int sysfs_create_dir(struct kobject *kobj)
+int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
 	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
-	const void *ns = NULL;
 	int error = 0;
 
 	BUG_ON(!kobj);
@@ -750,8 +750,6 @@ int sysfs_create_dir(struct kobject *kobj)
 	if (!parent_sd)
 		return -ENOENT;
 
-	if (sysfs_ns_type(parent_sd))
-		ns = kobj->ktype->namespace(kobj);
 	type = sysfs_read_ns_type(kobj);
 
 	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
@@ -909,26 +907,21 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	return error;
 }
 
-int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
+int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
+			const void *new_ns)
 {
 	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
-	const void *new_ns = NULL;
-
-	if (sysfs_ns_type(parent_sd))
-		new_ns = kobj->ktype->namespace(kobj);
 
 	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
 }
 
-int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
+int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
+		      const void *new_ns)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
-	const void *new_ns = NULL;
 
 	BUG_ON(!sd->s_parent);
-	if (sysfs_ns_type(sd->s_parent))
-		new_ns = kobj->ktype->namespace(kobj);
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index de6dcbcc6ef7..e7ba650086ce 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -107,6 +107,7 @@ extern int __must_check kobject_move(struct kobject *, struct kobject *);
 extern struct kobject *kobject_get(struct kobject *kobj);
 extern void kobject_put(struct kobject *kobj);
 
+extern const void *kobject_namespace(struct kobject *kobj);
 extern char *kobject_get_path(struct kobject *kobj, gfp_t flag);
 
 struct kobj_type {
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 82f7fac78e77..7f56bad3be82 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -182,11 +182,13 @@ struct sysfs_dirent;
 int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 			    void *data, struct module *owner);
 
-int __must_check sysfs_create_dir(struct kobject *kobj);
+int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns);
 void sysfs_remove_dir(struct kobject *kobj);
-int __must_check sysfs_rename_dir(struct kobject *kobj, const char *new_name);
-int __must_check sysfs_move_dir(struct kobject *kobj,
-				struct kobject *new_parent_kobj);
+int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
+				     const void *new_ns);
+int __must_check sysfs_move_dir_ns(struct kobject *kobj,
+				   struct kobject *new_parent_kobj,
+				   const void *new_ns);
 
 int __must_check sysfs_create_file_ns(struct kobject *kobj,
 				      const struct attribute *attr,
@@ -258,7 +260,7 @@ static inline int sysfs_schedule_callback(struct kobject *kobj,
 	return -ENOSYS;
 }
 
-static inline int sysfs_create_dir(struct kobject *kobj)
+static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
 	return 0;
 }
@@ -267,13 +269,15 @@ static inline void sysfs_remove_dir(struct kobject *kobj)
 {
 }
 
-static inline int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
+static inline int sysfs_rename_dir_ns(struct kobject *kobj,
+				      const char *new_name, const void *new_ns)
 {
 	return 0;
 }
 
-static inline int sysfs_move_dir(struct kobject *kobj,
-				 struct kobject *new_parent_kobj)
+static inline int sysfs_move_dir_ns(struct kobject *kobj,
+				    struct kobject *new_parent_kobj,
+				    const void *new_ns)
 {
 	return 0;
 }
diff --git a/lib/kobject.c b/lib/kobject.c
index 962175134702..85fb3a161b21 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -18,6 +18,24 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 
+/**
+ * kobject_namespace - return @kobj's namespace tag
+ * @kobj: kobject in question
+ *
+ * Returns namespace tag of @kobj if its parent has namespace ops enabled
+ * and thus @kobj should have a namespace tag associated with it.  Returns
+ * %NULL otherwise.
+ */
+const void *kobject_namespace(struct kobject *kobj)
+{
+	const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj);
+
+	if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE)
+		return NULL;
+
+	return kobj->ktype->namespace(kobj);
+}
+
 /*
  * populate_dir - populate directory with attributes.
  * @kobj: object we're working on.
@@ -46,8 +64,9 @@ static int populate_dir(struct kobject *kobj)
 
 static int create_dir(struct kobject *kobj)
 {
-	int error = 0;
-	error = sysfs_create_dir(kobj);
+	int error;
+
+	error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
 	if (!error) {
 		error = populate_dir(kobj);
 		if (error)
@@ -428,7 +447,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name)
 		goto out;
 	}
 
-	error = sysfs_rename_dir(kobj, new_name);
+	error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj));
 	if (error)
 		goto out;
 
@@ -472,6 +491,7 @@ int kobject_move(struct kobject *kobj, struct kobject *new_parent)
 		if (kobj->kset)
 			new_parent = kobject_get(&kobj->kset->kobj);
 	}
+
 	/* old object path */
 	devpath = kobject_get_path(kobj, GFP_KERNEL);
 	if (!devpath) {
@@ -486,7 +506,7 @@ int kobject_move(struct kobject *kobj, struct kobject *new_parent)
 	sprintf(devpath_string, "DEVPATH_OLD=%s", devpath);
 	envp[0] = devpath_string;
 	envp[1] = NULL;
-	error = sysfs_move_dir(kobj, new_parent);
+	error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj));
 	if (error)
 		goto out;
 	old_parent = kobj->parent;
-- 
cgit v1.3


From 4b30ee58ee64c64f59fd876e4afa6ed82caef3a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 11 Sep 2013 22:29:06 -0400
Subject: sysfs: remove ktype->namespace() invocations in symlink code

There's no reason for sysfs to be calling ktype->namespace().  It is
backwards, obfuscates what's going on and unnecessarily tangles two
separate layers.

There are two places where symlink code calls ktype->namespace().

* sysfs_do_create_link_sd() calls it to find out the namespace tag of
  the target directory.  Unless symlinking races with cross-namespace
  renaming, this equals @target_sd->s_ns.

* sysfs_rename_link() uses it to find out the new namespace to rename
  to and the new namespace can be different from the existing one.
  The function is renamed to sysfs_rename_link_ns() with an explicit
  @ns argument and the ktype->namespace() invocation is shifted to the
  device layer.

While this patch replaces ktype->namespace() invocation with the
recorded result in @target_sd, this shouldn't result in any behvior
difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c   |  8 +++++---
 fs/sysfs/symlink.c    | 16 +++++++---------
 include/linux/sysfs.h | 16 ++++++++++++----
 3 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index c7cfadcf6752..3335000be2dc 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1881,6 +1881,7 @@ EXPORT_SYMBOL_GPL(device_destroy);
  */
 int device_rename(struct device *dev, const char *new_name)
 {
+	struct kobject *kobj = &dev->kobj;
 	char *old_device_name = NULL;
 	int error;
 
@@ -1898,13 +1899,14 @@ int device_rename(struct device *dev, const char *new_name)
 	}
 
 	if (dev->class) {
-		error = sysfs_rename_link(&dev->class->p->subsys.kobj,
-			&dev->kobj, old_device_name, new_name);
+		error = sysfs_rename_link_ns(&dev->class->p->subsys.kobj,
+					     kobj, old_device_name,
+					     new_name, kobject_namespace(kobj));
 		if (error)
 			goto out;
 	}
 
-	error = kobject_rename(&dev->kobj, new_name);
+	error = kobject_rename(kobj, new_name);
 	if (error)
 		goto out;
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 2dd4507d9edd..12d58ada3e6d 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -52,7 +52,7 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 
 	ns_type = sysfs_ns_type(parent_sd);
 	if (ns_type)
-		sd->s_ns = target->ktype->namespace(target);
+		sd->s_ns = target_sd->s_ns;
 	sd->s_symlink.target_sd = target_sd;
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
@@ -181,19 +181,20 @@ void sysfs_remove_link(struct kobject *kobj, const char *name)
 EXPORT_SYMBOL_GPL(sysfs_remove_link);
 
 /**
- *	sysfs_rename_link - rename symlink in object's directory.
+ *	sysfs_rename_link_ns - rename symlink in object's directory.
  *	@kobj:	object we're acting for.
  *	@targ:	object we're pointing to.
  *	@old:	previous name of the symlink.
  *	@new:	new name of the symlink.
+ *	@new_ns: new namespace of the symlink.
  *
  *	A helper function for the common rename symlink idiom.
  */
-int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
-			const char *old, const char *new)
+int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
+			 const char *old, const char *new, const void *new_ns)
 {
 	struct sysfs_dirent *parent_sd, *sd = NULL;
-	const void *old_ns = NULL, *new_ns = NULL;
+	const void *old_ns = NULL;
 	int result;
 
 	if (!kobj)
@@ -215,16 +216,13 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
 	if (sd->s_symlink.target_sd->s_dir.kobj != targ)
 		goto out;
 
-	if (sysfs_ns_type(parent_sd))
-		new_ns = targ->ktype->namespace(targ);
-
 	result = sysfs_rename(sd, parent_sd, new_ns, new);
 
 out:
 	sysfs_put(sd);
 	return result;
 }
-EXPORT_SYMBOL_GPL(sysfs_rename_link);
+EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
 
 static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
 				 struct sysfs_dirent *target_sd, char *path)
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 7f56bad3be82..c792f73ac7fa 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -213,8 +213,9 @@ int __must_check sysfs_create_link_nowarn(struct kobject *kobj,
 					  const char *name);
 void sysfs_remove_link(struct kobject *kobj, const char *name);
 
-int sysfs_rename_link(struct kobject *kobj, struct kobject *target,
-			const char *old_name, const char *new_name);
+int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target,
+			 const char *old_name, const char *new_name,
+			 const void *new_ns);
 
 void sysfs_delete_link(struct kobject *dir, struct kobject *targ,
 			const char *name);
@@ -340,8 +341,9 @@ static inline void sysfs_remove_link(struct kobject *kobj, const char *name)
 {
 }
 
-static inline int sysfs_rename_link(struct kobject *k, struct kobject *t,
-				    const char *old_name, const char *new_name)
+static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t,
+				       const char *old_name,
+				       const char *new_name, const void *ns)
 {
 	return 0;
 }
@@ -454,4 +456,10 @@ static inline void sysfs_remove_file(struct kobject *kobj,
 	return sysfs_remove_file_ns(kobj, attr, NULL);
 }
 
+static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target,
+				    const char *old_name, const char *new_name)
+{
+	return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL);
+}
+
 #endif /* _SYSFS_H_ */
-- 
cgit v1.3


From cb26a311578e67769e92a39a0a63476533cb7e12 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 11 Sep 2013 22:29:07 -0400
Subject: sysfs: drop kobj_ns_type handling

The way namespace tags are implemented in sysfs is more complicated
than necessary.  As each tag is a pointer value and required to be
non-NULL under a namespace enabled parent, there's no need to record
separately what type each tag is or where namespace is enabled.

If multiple namespace types are needed, which currently aren't, we can
simply compare the tag to a set of allowed tags in the superblock
assuming that the tags, being pointers, won't have the same value
across multiple types.  Also, whether to filter by namespace tag or
not can be trivially determined by whether the node has any tagged
children or not.

This patch rips out kobj_ns_type handling from sysfs.  sysfs no longer
cares whether specific type of namespace is enabled or not.  If a
sysfs_dirent has a non-NULL tag, the parent is marked as needing
namespace filtering and the value is tested against the allowed set of
tags for the superblock (currently only one but increasing this number
isn't difficult) and the sysfs_dirent is ignored if it doesn't match.

This removes most kobject namespace knowledge from sysfs proper which
will enable proper separation and layering of sysfs.  The namespace
sanity checks in fs/sysfs/dir.c are replaced by the new sanity check
in kobject_namespace().  As this is the only place ktype->namespace()
is called for sysfs, this doesn't weaken the sanity check
significantly.  I omitted converting the sanity check in
sysfs_do_create_link_sd().  While the check can be shifted to upper
layer, mistakes there are well contained and should be easily visible
anyway.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c     | 90 +++++++++++++++---------------------------------------
 fs/sysfs/mount.c   | 24 ++++-----------
 fs/sysfs/symlink.c | 27 ++++------------
 fs/sysfs/sysfs.h   | 25 +++++----------
 lib/kobject.c      |  5 ++-
 5 files changed, 48 insertions(+), 123 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 878ac3afe1b8..1dfb4aaf9446 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -111,6 +111,11 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd)
 	/* add new node and rebalance the tree */
 	rb_link_node(&sd->s_rb, parent, node);
 	rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
+
+	/* if @sd has ns tag, mark the parent to enable ns filtering */
+	if (sd->s_ns)
+		sd->s_parent->s_flags |= SYSFS_FLAG_HAS_NS;
+
 	return 0;
 }
 
@@ -130,6 +135,13 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 		sd->s_parent->s_dir.subdirs--;
 
 	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
+
+	/*
+	 * Either all or none of the children have tags.  Clearing HAS_NS
+	 * when there's no child left is enough to keep the flag synced.
+	 */
+	if (RB_EMPTY_ROOT(&sd->s_parent->s_dir.children))
+		sd->s_parent->s_flags &= ~SYSFS_FLAG_HAS_NS;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -297,7 +309,6 @@ static int sysfs_dentry_delete(const struct dentry *dentry)
 static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct sysfs_dirent *sd;
-	int type;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -318,13 +329,8 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 		goto out_bad;
 
 	/* The sysfs dirent has been moved to a different namespace */
-	type = KOBJ_NS_TYPE_NONE;
-	if (sd->s_parent) {
-		type = sysfs_ns_type(sd->s_parent);
-		if (type != KOBJ_NS_TYPE_NONE &&
-				sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
-			goto out_bad;
-	}
+	if (sd->s_ns && sd->s_ns != sysfs_info(dentry->d_sb)->ns)
+		goto out_bad;
 
 	mutex_unlock(&sysfs_mutex);
 out_valid:
@@ -445,13 +451,6 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 	struct sysfs_inode_attrs *ps_iattr;
 	int ret;
 
-	if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
-		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-			sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid",
-			acxt->parent_sd->s_name, sd->s_name);
-		return -EINVAL;
-	}
-
 	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
@@ -613,13 +612,6 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 	struct rb_node *node = parent_sd->s_dir.children.rb_node;
 	unsigned int hash;
 
-	if (!!sysfs_ns_type(parent_sd) != !!ns) {
-		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-			sysfs_ns_type(parent_sd) ? "required" : "invalid",
-			parent_sd->s_name, name);
-		return NULL;
-	}
-
 	hash = sysfs_name_hash(ns, name);
 	while (node) {
 		struct sysfs_dirent *sd;
@@ -667,8 +659,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-	enum kobj_ns_type type, const void *ns, const char *name,
-	struct sysfs_dirent **p_sd)
+	const void *ns, const char *name, struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -680,7 +671,6 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	if (!sd)
 		return -ENOMEM;
 
-	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
 	sd->s_ns = ns;
 	sd->s_dir.kobj = kobj;
 
@@ -700,33 +690,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd,
-			  KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
-}
-
-/**
- *	sysfs_read_ns_type: return associated ns_type
- *	@kobj: the kobject being queried
- *
- *	Each kobject can be tagged with exactly one namespace type
- *	(i.e. network or user).  Return the ns_type associated with
- *	this object if any
- */
-static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
-{
-	const struct kobj_ns_type_operations *ops;
-	enum kobj_ns_type type;
-
-	ops = kobj_child_ns_ops(kobj);
-	if (!ops)
-		return KOBJ_NS_TYPE_NONE;
-
-	type = ops->type;
-	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
-	BUG_ON(type >= KOBJ_NS_TYPES);
-	BUG_ON(!kobj_ns_type_registered(type));
-
-	return type;
+	return create_dir(kobj, kobj->sd, NULL, name, p_sd);
 }
 
 /**
@@ -736,7 +700,6 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
  */
 int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
-	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
 	int error = 0;
 
@@ -750,9 +713,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 	if (!parent_sd)
 		return -ENOENT;
 
-	type = sysfs_read_ns_type(kobj);
-
-	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
+	error = create_dir(kobj, parent_sd, ns, kobject_name(kobj), &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -766,13 +727,12 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
-	enum kobj_ns_type type;
-	const void *ns;
+	const void *ns = NULL;
 
 	mutex_lock(&sysfs_mutex);
 
-	type = sysfs_ns_type(parent_sd);
-	ns = sysfs_info(dir->i_sb)->ns[type];
+	if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS)
+		ns = sysfs_info(dir->i_sb)->ns;
 
 	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
@@ -995,15 +955,15 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx)
 	struct dentry *dentry = file->f_path.dentry;
 	struct sysfs_dirent *parent_sd = dentry->d_fsdata;
 	struct sysfs_dirent *pos = file->private_data;
-	enum kobj_ns_type type;
-	const void *ns;
-
-	type = sysfs_ns_type(parent_sd);
-	ns = sysfs_info(dentry->d_sb)->ns[type];
+	const void *ns = NULL;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 	mutex_lock(&sysfs_mutex);
+
+	if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS)
+		ns = sysfs_info(dentry->d_sb)->ns;
+
 	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
 	     pos;
 	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a3..8c24bce2f4ae 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -36,7 +36,7 @@ static const struct super_operations sysfs_ops = {
 struct sysfs_dirent sysfs_root = {
 	.s_name		= "",
 	.s_count	= ATOMIC_INIT(1),
-	.s_flags	= SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
+	.s_flags	= SYSFS_DIR,
 	.s_mode		= S_IFDIR | S_IRUGO | S_IXUGO,
 	.s_ino		= 1,
 };
@@ -77,14 +77,8 @@ static int sysfs_test_super(struct super_block *sb, void *data)
 {
 	struct sysfs_super_info *sb_info = sysfs_info(sb);
 	struct sysfs_super_info *info = data;
-	enum kobj_ns_type type;
-	int found = 1;
 
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
-		if (sb_info->ns[type] != info->ns[type])
-			found = 0;
-	}
-	return found;
+	return sb_info->ns == info->ns;
 }
 
 static int sysfs_set_super(struct super_block *sb, void *data)
@@ -98,9 +92,7 @@ static int sysfs_set_super(struct super_block *sb, void *data)
 
 static void free_sysfs_super_info(struct sysfs_super_info *info)
 {
-	int type;
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-		kobj_ns_drop(type, info->ns[type]);
+	kobj_ns_drop(KOBJ_NS_TYPE_NET, info->ns);
 	kfree(info);
 }
 
@@ -108,7 +100,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
 	struct sysfs_super_info *info;
-	enum kobj_ns_type type;
 	struct super_block *sb;
 	int error;
 
@@ -116,18 +107,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
 			return ERR_PTR(-EPERM);
 
-		for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
-			if (!kobj_ns_current_may_mount(type))
-				return ERR_PTR(-EPERM);
-		}
+		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
+			return ERR_PTR(-EPERM);
 	}
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
 
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-		info->ns[type] = kobj_ns_grab_current(type);
+	info->ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
 
 	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 12d58ada3e6d..7d981ce2e87f 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,7 +28,6 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 	struct sysfs_dirent *target_sd = NULL;
 	struct sysfs_dirent *sd = NULL;
 	struct sysfs_addrm_cxt acxt;
-	enum kobj_ns_type ns_type;
 	int error;
 
 	BUG_ON(!name || !parent_sd);
@@ -50,29 +49,15 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 	if (!sd)
 		goto out_put;
 
-	ns_type = sysfs_ns_type(parent_sd);
-	if (ns_type)
-		sd->s_ns = target_sd->s_ns;
+	sd->s_ns = target_sd->s_ns;
 	sd->s_symlink.target_sd = target_sd;
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
 	sysfs_addrm_start(&acxt, parent_sd);
-	/* Symlinks must be between directories with the same ns_type */
-	if (!ns_type ||
-	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
-		if (warn)
-			error = sysfs_add_one(&acxt, sd);
-		else
-			error = __sysfs_add_one(&acxt, sd);
-	} else {
-		error = -EINVAL;
-		WARN(1, KERN_WARNING
-			"sysfs: symlink across ns_types %s/%s -> %s/%s\n",
-			parent_sd->s_name,
-			sd->s_name,
-			sd->s_symlink.target_sd->s_parent->s_name,
-			sd->s_symlink.target_sd->s_name);
-	}
+	if (warn)
+		error = sysfs_add_one(&acxt, sd);
+	else
+		error = __sysfs_add_one(&acxt, sd);
 	sysfs_addrm_finish(&acxt);
 
 	if (error)
@@ -156,7 +141,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 {
 	const void *ns = NULL;
 	spin_lock(&sysfs_assoc_lock);
-	if (targ->sd && sysfs_ns_type(kobj->sd))
+	if (targ->sd)
 		ns = targ->sd->s_ns;
 	spin_unlock(&sysfs_assoc_lock);
 	sysfs_hash_and_remove(kobj->sd, ns, name);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index a96da2559db2..7664d1b3d594 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -93,11 +93,8 @@ struct sysfs_dirent {
 #define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
 #define SYSFS_ACTIVE_REF		(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
 
-/* identify any namespace tag on sysfs_dirents */
-#define SYSFS_NS_TYPE_MASK		0xf00
-#define SYSFS_NS_TYPE_SHIFT		8
-
-#define SYSFS_FLAG_MASK			~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
+#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
+#define SYSFS_FLAG_HAS_NS		0x01000
 #define SYSFS_FLAG_REMOVED		0x02000
 
 static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
@@ -105,15 +102,6 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
 	return sd->s_flags & SYSFS_TYPE_MASK;
 }
 
-/*
- * Return any namespace tags on this dirent.
- * enum kobj_ns_type is defined in linux/kobject.h
- */
-static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
-{
-	return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
-}
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define sysfs_dirent_init_lockdep(sd)				\
 do {								\
@@ -141,12 +129,13 @@ struct sysfs_addrm_cxt {
  */
 
 /*
- * Each sb is associated with a set of namespace tags (i.e.
- * the network namespace of the task which mounted this sysfs
- * instance).
+ * Each sb is associated with one namespace tag, currently the network
+ * namespace of the task which mounted this sysfs instance.  If multiple
+ * tags become necessary, make the following an array and compare
+ * sysfs_dirent tag against every entry.
  */
 struct sysfs_super_info {
-	void *ns[KOBJ_NS_TYPES];
+	void *ns;
 };
 #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
 extern struct sysfs_dirent sysfs_root;
diff --git a/lib/kobject.c b/lib/kobject.c
index 85fb3a161b21..e769ee3c2fb9 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -29,11 +29,14 @@
 const void *kobject_namespace(struct kobject *kobj)
 {
 	const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj);
+	const void *ns;
 
 	if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE)
 		return NULL;
 
-	return kobj->ktype->namespace(kobj);
+	ns = kobj->ktype->namespace(kobj);
+	WARN_ON(!ns);	/* @kobj in a namespace is required to have !NULL tag */
+	return ns;
 }
 
 /*
-- 
cgit v1.3


From 388975cccaaf11abd47525f664c76891c440481a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 11 Sep 2013 23:19:13 -0400
Subject: sysfs: clean up sysfs_get_dirent()

The pre-existing sysfs interfaces which take explicit namespace
argument are weird in that they place the optional @ns in front of
@name which is contrary to the established convention.  For example,
we end up forcing vast majority of sysfs_get_dirent() users to do
sysfs_get_dirent(parent, NULL, name), which is silly and error-prone
especially as @ns and @name may be interchanged without causing
compilation warning.

This renames sysfs_get_dirent() to sysfs_get_dirent_ns() and swap the
positions of @name and @ns, and sysfs_get_dirent() is now a wrapper
around sysfs_get_dirent_ns().  This makes confusions a lot less
likely.

There are other interfaces which take @ns before @name.  They'll be
updated by following patches.

This patch doesn't introduce any functional changes.

v2: EXPORT_SYMBOL_GPL() wasn't updated leading to undefined symbol
    error on module builds.  Reported by build test robot.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kay Sievers <kay@vrfy.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpio/gpiolib.c |  2 +-
 drivers/md/bitmap.c    |  4 ++--
 drivers/md/md.c        |  2 +-
 drivers/md/md.h        |  2 +-
 fs/sysfs/dir.c         | 11 ++++++-----
 fs/sysfs/file.c        |  4 ++--
 fs/sysfs/group.c       | 10 +++++-----
 fs/sysfs/symlink.c     |  2 +-
 fs/sysfs/sysfs.h       |  3 ---
 include/linux/sysfs.h  | 19 ++++++++++++-------
 10 files changed, 31 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 86ef3461ec06..a094356020a6 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -408,7 +408,7 @@ static int gpio_setup_irq(struct gpio_desc *desc, struct device *dev,
 			IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING;
 
 	if (!value_sd) {
-		value_sd = sysfs_get_dirent(dev->kobj.sd, NULL, "value");
+		value_sd = sysfs_get_dirent(dev->kobj.sd, "value");
 		if (!value_sd) {
 			ret = -ENODEV;
 			goto err_out;
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a7fd82133b12..12dc29ba7399 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1654,9 +1654,9 @@ int bitmap_create(struct mddev *mddev)
 	bitmap->mddev = mddev;
 
 	if (mddev->kobj.sd)
-		bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap");
+		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
 	if (bm) {
-		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear");
+		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
 		sysfs_put(bm);
 	} else
 		bitmap->sysfs_can_clear = NULL;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index adf4d7e1d5e1..8a0d7625681c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3555,7 +3555,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 			printk(KERN_WARNING
 			       "md: cannot register extra attributes for %s\n",
 			       mdname(mddev));
-		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
+		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
 	}		
 	if (mddev->pers->sync_request != NULL &&
 	    pers->sync_request == NULL) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 608050c43f17..b0051f2fbc0c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -501,7 +501,7 @@ extern struct attribute_group md_bitmap_group;
 static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
 {
 	if (sd)
-		return sysfs_get_dirent(sd, NULL, name);
+		return sysfs_get_dirent(sd, name);
 	return sd;
 }
 static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 1dfb4aaf9446..fee19d16e4a2 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -630,9 +630,10 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 }
 
 /**
- *	sysfs_get_dirent - find and get sysfs_dirent with the given name
+ *	sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
  *	@parent_sd: sysfs_dirent to search under
  *	@name: name to look for
+ *	@ns: the namespace tag to use
  *
  *	Look for sysfs_dirent with name @name under @parent_sd and get
  *	it if found.
@@ -643,9 +644,9 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
  *	RETURNS:
  *	Pointer to sysfs_dirent if found, NULL if not.
  */
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
-				      const void *ns,
-				      const unsigned char *name)
+struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
+					 const unsigned char *name,
+					 const void *ns)
 {
 	struct sysfs_dirent *sd;
 
@@ -656,7 +657,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 
 	return sd;
 }
-EXPORT_SYMBOL_GPL(sysfs_get_dirent);
+EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	const void *ns, const char *name, struct sysfs_dirent **p_sd)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e784340f1599..0f3214a70985 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -563,7 +563,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 	int error;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 
@@ -645,7 +645,7 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 	if (dir_sd) {
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 25c78f23dae8..21102158ca33 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -207,7 +207,7 @@ void sysfs_remove_group(struct kobject *kobj,
 	struct sysfs_dirent *sd;
 
 	if (grp->name) {
-		sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
+		sd = sysfs_get_dirent(dir_sd, grp->name);
 		if (!sd) {
 			WARN(!sd, KERN_WARNING
 			     "sysfs group %p not found for kobject '%s'\n",
@@ -262,7 +262,7 @@ int sysfs_merge_group(struct kobject *kobj,
 	struct attribute *const *attr;
 	int i;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
 	if (!dir_sd)
 		return -ENOENT;
 
@@ -289,7 +289,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 	struct attribute *const *attr;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
 	if (dir_sd) {
 		for (attr = grp->attrs; *attr; ++attr)
 			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
@@ -311,7 +311,7 @@ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
 	struct sysfs_dirent *dir_sd;
 	int error = 0;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
 	if (!dir_sd)
 		return -ENOENT;
 
@@ -333,7 +333,7 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 {
 	struct sysfs_dirent *dir_sd;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
 	if (dir_sd) {
 		sysfs_hash_and_remove(dir_sd, NULL, link_name);
 		sysfs_put(dir_sd);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 7d981ce2e87f..c96b31a16485 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -191,7 +191,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
 		old_ns = targ->sd->s_ns;
 
 	result = -ENOENT;
-	sd = sysfs_get_dirent(parent_sd, old_ns, old);
+	sd = sysfs_get_dirent_ns(parent_sd, old, old_ns);
 	if (!sd)
 		goto out;
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 7664d1b3d594..6faacafda777 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -164,9 +164,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 				       const void *ns,
 				       const unsigned char *name);
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
-				      const void *ns,
-				      const unsigned char *name);
 struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
 
 void release_sysfs_dirent(struct sysfs_dirent *sd);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index c792f73ac7fa..6695040a0317 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -245,9 +245,9 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 
 void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);
 void sysfs_notify_dirent(struct sysfs_dirent *sd);
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
-				      const void *ns,
-				      const unsigned char *name);
+struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
+					 const unsigned char *name,
+					 const void *ns);
 struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd);
 void sysfs_put(struct sysfs_dirent *sd);
 
@@ -422,10 +422,9 @@ static inline void sysfs_notify(struct kobject *kobj, const char *dir,
 static inline void sysfs_notify_dirent(struct sysfs_dirent *sd)
 {
 }
-static inline
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
-				      const void *ns,
-				      const unsigned char *name)
+static inline struct sysfs_dirent *
+sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, const unsigned char *name,
+		    const void *ns)
 {
 	return NULL;
 }
@@ -462,4 +461,10 @@ static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target
 	return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL);
 }
 
+static inline struct sysfs_dirent *
+sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name)
+{
+	return sysfs_get_dirent_ns(parent_sd, name, NULL);
+}
+
 #endif /* _SYSFS_H_ */
-- 
cgit v1.3


From cfec0bc835c84d3d3723d4955587f05a94879b26 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 11 Sep 2013 22:29:09 -0400
Subject: sysfs: @name comes before @ns

Some internal sysfs functions which take explicit namespace argument
are weird in that they place the optional @ns in front of @name which
is contrary to the established convention.  This is confusing and
error-prone especially as @ns and @name may be interchanged without
causing compilation warning.

Swap the positions of @name and @ns in the following internal
functions.

 sysfs_find_dirent()
 sysfs_rename()
 sysfs_hash_and_remove()
 sysfs_name_hash()
 sysfs_name_compare()
 create_dir()

This patch doesn't introduce any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/bin.c     |  2 +-
 fs/sysfs/dir.c     | 45 +++++++++++++++++++++++----------------------
 fs/sysfs/file.c    | 10 +++++-----
 fs/sysfs/group.c   | 12 ++++++------
 fs/sysfs/inode.c   |  6 +++---
 fs/sysfs/symlink.c |  6 +++---
 fs/sysfs/sysfs.h   | 10 +++++-----
 7 files changed, 46 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index c590cabd57bb..d49e6ca3345b 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -497,6 +497,6 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
+	sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index fee19d16e4a2..d23e66dfba74 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -35,12 +35,12 @@ static DEFINE_IDA(sysfs_ino_ida);
 
 /**
  *	sysfs_name_hash
- *	@ns:   Namespace tag to hash
  *	@name: Null terminated string to hash
+ *	@ns:   Namespace tag to hash
  *
  *	Returns 31 bit hash of ns + name (so it fits in an off_t )
  */
-static unsigned int sysfs_name_hash(const void *ns, const char *name)
+static unsigned int sysfs_name_hash(const char *name, const void *ns)
 {
 	unsigned long hash = init_name_hash();
 	unsigned int len = strlen(name);
@@ -56,8 +56,8 @@ static unsigned int sysfs_name_hash(const void *ns, const char *name)
 	return hash;
 }
 
-static int sysfs_name_compare(unsigned int hash, const void *ns,
-	const char *name, const struct sysfs_dirent *sd)
+static int sysfs_name_compare(unsigned int hash, const char *name,
+			      const void *ns, const struct sysfs_dirent *sd)
 {
 	if (hash != sd->s_hash)
 		return hash - sd->s_hash;
@@ -69,7 +69,7 @@ static int sysfs_name_compare(unsigned int hash, const void *ns,
 static int sysfs_sd_compare(const struct sysfs_dirent *left,
 			    const struct sysfs_dirent *right)
 {
-	return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name,
+	return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
 				  right);
 }
 
@@ -451,7 +451,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 	struct sysfs_inode_attrs *ps_iattr;
 	int ret;
 
-	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
+	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
 	ret = sysfs_link_sibling(sd);
@@ -596,6 +596,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
  *	sysfs_find_dirent - find sysfs_dirent with the given name
  *	@parent_sd: sysfs_dirent to search under
  *	@name: name to look for
+ *	@ns: the namespace tag to use
  *
  *	Look for sysfs_dirent with name @name under @parent_sd.
  *
@@ -606,19 +607,19 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
  *	Pointer to sysfs_dirent if found, NULL if not.
  */
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const void *ns,
-				       const unsigned char *name)
+				       const unsigned char *name,
+				       const void *ns)
 {
 	struct rb_node *node = parent_sd->s_dir.children.rb_node;
 	unsigned int hash;
 
-	hash = sysfs_name_hash(ns, name);
+	hash = sysfs_name_hash(name, ns);
 	while (node) {
 		struct sysfs_dirent *sd;
 		int result;
 
 		sd = to_sysfs_dirent(node);
-		result = sysfs_name_compare(hash, ns, name, sd);
+		result = sysfs_name_compare(hash, name, ns, sd);
 		if (result < 0)
 			node = node->rb_left;
 		else if (result > 0)
@@ -651,7 +652,7 @@ struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
 	struct sysfs_dirent *sd;
 
 	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, ns, name);
+	sd = sysfs_find_dirent(parent_sd, name, ns);
 	sysfs_get(sd);
 	mutex_unlock(&sysfs_mutex);
 
@@ -660,7 +661,8 @@ struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-	const void *ns, const char *name, struct sysfs_dirent **p_sd)
+		      const char *name, const void *ns,
+		      struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -691,7 +693,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd, NULL, name, p_sd);
+	return create_dir(kobj, kobj->sd, name, NULL, p_sd);
 }
 
 /**
@@ -714,7 +716,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 	if (!parent_sd)
 		return -ENOENT;
 
-	error = create_dir(kobj, parent_sd, ns, kobject_name(kobj), &sd);
+	error = create_dir(kobj, parent_sd, kobject_name(kobj), ns, &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -735,7 +737,7 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS)
 		ns = sysfs_info(dir->i_sb)->ns;
 
-	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
+	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
 
 	/* no such entry */
 	if (!sd) {
@@ -823,9 +825,8 @@ void sysfs_remove_dir(struct kobject *kobj)
 	__sysfs_remove_dir(sd);
 }
 
-int sysfs_rename(struct sysfs_dirent *sd,
-	struct sysfs_dirent *new_parent_sd, const void *new_ns,
-	const char *new_name)
+int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
+		 const char *new_name, const void *new_ns)
 {
 	int error;
 
@@ -837,7 +838,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		goto out;	/* nothing to rename */
 
 	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
+	if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
 		goto out;
 
 	/* rename sysfs_dirent */
@@ -858,7 +859,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	sysfs_get(new_parent_sd);
 	sysfs_put(sd->s_parent);
 	sd->s_ns = new_ns;
-	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
+	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
 	sd->s_parent = new_parent_sd;
 	sysfs_link_sibling(sd);
 
@@ -873,7 +874,7 @@ int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
 {
 	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
 
-	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
+	return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns);
 }
 
 int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
@@ -886,7 +887,7 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
+	return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 0f3214a70985..4697019fafa3 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -466,9 +466,9 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 	mutex_lock(&sysfs_mutex);
 
 	if (sd && dir)
-		sd = sysfs_find_dirent(sd, NULL, dir);
+		sd = sysfs_find_dirent(sd, dir, NULL);
 	if (sd && attr)
-		sd = sysfs_find_dirent(sd, NULL, attr);
+		sd = sysfs_find_dirent(sd, attr, NULL);
 	if (sd)
 		sysfs_notify_dirent(sd);
 
@@ -594,7 +594,7 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 	mutex_lock(&sysfs_mutex);
 
 	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
+	sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
 	if (!sd)
 		goto out;
 
@@ -621,7 +621,7 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
 {
 	struct sysfs_dirent *dir_sd = kobj->sd;
 
-	sysfs_hash_and_remove(dir_sd, ns, attr->name);
+	sysfs_hash_and_remove(dir_sd, attr->name, ns);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
 
@@ -649,7 +649,7 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 	else
 		dir_sd = sysfs_get(kobj->sd);
 	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, NULL, attr->name);
+		sysfs_hash_and_remove(dir_sd, attr->name, NULL);
 		sysfs_put(dir_sd);
 	}
 }
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 21102158ca33..2dae55c4f7dc 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -26,7 +26,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 
 	if (grp->attrs)
 		for (attr = grp->attrs; *attr; attr++)
-			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
 	if (grp->bin_attrs)
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
 			sysfs_remove_bin_file(kobj, *bin_attr);
@@ -49,8 +49,8 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 * re-adding (if required) the file.
 			 */
 			if (update)
-				sysfs_hash_and_remove(dir_sd, NULL,
-						      (*attr)->name);
+				sysfs_hash_and_remove(dir_sd, (*attr)->name,
+						      NULL);
 			if (grp->is_visible) {
 				mode = grp->is_visible(kobj, *attr, i);
 				if (!mode)
@@ -270,7 +270,7 @@ int sysfs_merge_group(struct kobject *kobj,
 		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
 	if (error) {
 		while (--i >= 0)
-			sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
+			sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL);
 	}
 	sysfs_put(dir_sd);
 
@@ -292,7 +292,7 @@ void sysfs_unmerge_group(struct kobject *kobj,
 	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
 	if (dir_sd) {
 		for (attr = grp->attrs; *attr; ++attr)
-			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
 		sysfs_put(dir_sd);
 	}
 }
@@ -335,7 +335,7 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 
 	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
 	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, NULL, link_name);
+		sysfs_hash_and_remove(dir_sd, link_name, NULL);
 		sysfs_put(dir_sd);
 	}
 }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 963f910c8034..07193d720d92 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -314,8 +314,8 @@ void sysfs_evict_inode(struct inode *inode)
 	sysfs_put(sd);
 }
 
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
-			  const char *name)
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
+			  const void *ns)
 {
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
@@ -328,7 +328,7 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
 
 	sysfs_addrm_start(&acxt, dir_sd);
 
-	sd = sysfs_find_dirent(dir_sd, ns, name);
+	sd = sysfs_find_dirent(dir_sd, name, ns);
 	if (sd)
 		sysfs_remove_one(&acxt, sd);
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c96b31a16485..88c8bc5e8911 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -144,7 +144,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 	if (targ->sd)
 		ns = targ->sd->s_ns;
 	spin_unlock(&sysfs_assoc_lock);
-	sysfs_hash_and_remove(kobj->sd, ns, name);
+	sysfs_hash_and_remove(kobj->sd, name, ns);
 }
 
 /**
@@ -161,7 +161,7 @@ void sysfs_remove_link(struct kobject *kobj, const char *name)
 	else
 		parent_sd = kobj->sd;
 
-	sysfs_hash_and_remove(parent_sd, NULL, name);
+	sysfs_hash_and_remove(parent_sd, name, NULL);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link);
 
@@ -201,7 +201,7 @@ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
 	if (sd->s_symlink.target_sd->s_dir.kobj != targ)
 		goto out;
 
-	result = sysfs_rename(sd, parent_sd, new_ns, new);
+	result = sysfs_rename(sd, parent_sd, new, new_ns);
 
 out:
 	sysfs_put(sd);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6faacafda777..ee44fde199d0 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -162,8 +162,8 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const void *ns,
-				       const unsigned char *name);
+				       const unsigned char *name,
+				       const void *ns);
 struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
 
 void release_sysfs_dirent(struct sysfs_dirent *sd);
@@ -173,7 +173,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
 int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const void *ns, const char *new_name);
+		 const char *new_name, const void *new_ns);
 
 static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
 {
@@ -204,8 +204,8 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		   size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
-			  const char *name);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
+			  const void *ns);
 int sysfs_inode_init(void);
 
 /*
-- 
cgit v1.3


From af5c269799feaef110e59ce55b497cdd08712b0c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 27 Sep 2013 12:49:33 +0100
Subject: GFS2: Clean up reservation removal

The reservation for an inode should be cleared when it is truncated so
that we can start again at a different offset for future allocations.
We could try and do better than that, by resetting the search based on
where the truncation started from, but this is only a first step.

In addition, there are three callers of gfs2_rs_delete() but only one
of those should really be testing the value of i_writecount. While
we get away with that in the other cases currently, I think it would
be better if we made that test specific to the one case which
requires it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c  | 4 +++-
 fs/gfs2/file.c  | 2 +-
 fs/gfs2/inode.c | 2 +-
 fs/gfs2/rgrp.c  | 7 +++----
 fs/gfs2/rgrp.h  | 2 +-
 fs/gfs2/super.c | 2 +-
 6 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 62a65fc448dc..21ad0f11cad4 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1279,6 +1279,7 @@ do_grow_qunlock:
 
 int gfs2_setattr_size(struct inode *inode, u64 newsize)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret;
 	u64 oldsize;
 
@@ -1294,7 +1295,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 
 	inode_dio_wait(inode);
 
-	ret = gfs2_rs_alloc(GFS2_I(inode));
+	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		goto out;
 
@@ -1304,6 +1305,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 		goto out;
 	}
 
+	gfs2_rs_deltree(ip->i_res);
 	ret = do_shrink(inode, oldsize, newsize);
 out:
 	put_write_access(inode);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0621b46d474d..9ad20edc9c27 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -620,7 +620,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
 	if (!(file->f_mode & FMODE_WRITE))
 		return 0;
 
-	gfs2_rs_delete(ip);
+	gfs2_rs_delete(ip, &inode->i_writecount);
 	return 0;
 }
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index cd58611912f5..4b79c19100d2 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -711,7 +711,7 @@ fail_gunlock2:
 fail_free_inode:
 	if (ip->i_gl)
 		gfs2_glock_put(ip->i_gl);
-	gfs2_rs_delete(ip);
+	gfs2_rs_delete(ip, NULL);
 	free_inode_nonrcu(inode);
 	inode = NULL;
 fail_gunlock:
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 285dd363199a..d4d10fadab79 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -661,14 +661,13 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
 /**
  * gfs2_rs_delete - delete a multi-block reservation
  * @ip: The inode for this reservation
+ * @wcount: The inode's write count, or NULL
  *
  */
-void gfs2_rs_delete(struct gfs2_inode *ip)
+void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
 {
-	struct inode *inode = &ip->i_inode;
-
 	down_write(&ip->i_rw_mutex);
-	if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) {
+	if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
 		gfs2_rs_deltree(ip->i_res);
 		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5b3f4a896e6c..57ea16ba3414 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,7 +48,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
 extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip);
+extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e5639dec66c4..35da5b19c0de 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1526,7 +1526,7 @@ out_unlock:
 out:
 	/* Case 3 starts here */
 	truncate_inode_pages(&inode->i_data, 0);
-	gfs2_rs_delete(ip);
+	gfs2_rs_delete(ip, NULL);
 	gfs2_ordered_del_inode(ip);
 	clear_inode(inode);
 	gfs2_dir_hash_inval(ip);
-- 
cgit v1.3


From 8fb883f3e30065529e4f35d4b4f355193dcdb7a2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 21 Sep 2013 00:09:31 +0100
Subject: FS-Cache: Add use/unuse/wake cookie wrappers

Add wrapper functions for dealing with cookie->n_active:

 (*) __fscache_use_cookie() to increment it.

 (*) __fscache_unuse_cookie() to decrement and test against zero.

 (*) __fscache_wake_unused_cookie() to wake up anyone waiting for it to reach
     zero.

The second and third are split so that the third can be done after cookie->lock
has been released in case the waiter wakes up whilst we're still holding it and
tries to get it.

We will need to wake-on-zero once the cookie disablement patch is applied
because it will then be possible to see n_active become zero without the cookie
being relinquished.

Also move the cookie usement out of fscache_attr_changed_op() and into
fscache_attr_changed() and the operation struct so that cookie disablement
will be able to track it.

Whilst we're at it, only increment n_active if we're about to do
fscache_submit_op() so that we don't have to deal with undoing it if anything
earlier fails.  Possibly this should be moved into fscache_submit_op() which
could look at FSCACHE_OP_UNUSE_COOKIE.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/fscache/cookie.c           |  7 +++++--
 fs/fscache/page.c             | 42 +++++++++++++++++++++++++++++++-----------
 include/linux/fscache-cache.h | 19 +++++++++++++++++--
 3 files changed, 53 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index b2a86e324aac..d851aa555d28 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -568,6 +568,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,", cookie);
@@ -600,7 +601,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
 
-	atomic_inc(&cookie->n_active);
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, op) < 0)
 		goto submit_failed;
 
@@ -622,9 +623,11 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	return ret;
 
 submit_failed:
-	atomic_dec(&cookie->n_active);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 inconsistent:
 	spin_unlock(&cookie->lock);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	kfree(op);
 	_leave(" = -ESTALE");
 	return -ESTALE;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 73899c1c3449..0fe42a6d0e9c 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -163,12 +163,10 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_attr_changed_calls);
 
-	if (fscache_object_is_active(object) &&
-	    fscache_use_cookie(object)) {
+	if (fscache_object_is_active(object)) {
 		fscache_stat(&fscache_n_cop_attr_changed);
 		ret = object->cache->ops->attr_changed(object);
 		fscache_stat_d(&fscache_n_cop_attr_changed);
-		fscache_unuse_cookie(object);
 		if (ret < 0)
 			fscache_abort_object(object);
 	}
@@ -184,6 +182,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
+	bool wake_cookie;
 
 	_enter("%p", cookie);
 
@@ -199,7 +198,9 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	}
 
 	fscache_operation_init(op, fscache_attr_changed_op, NULL);
-	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+	op->flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_EXCLUSIVE) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 
@@ -208,6 +209,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_exclusive_op(object, op) < 0)
 		goto nobufs;
 	spin_unlock(&cookie->lock);
@@ -217,8 +219,11 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	return 0;
 
 nobufs:
+	wake_cookie = __fscache_unuse_cookie(cookie);
 	spin_unlock(&cookie->lock);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_attr_changed_nobufs);
 	_leave(" = %d", -ENOBUFS);
 	return -ENOBUFS;
@@ -263,7 +268,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	}
 
 	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
-	atomic_inc(&cookie->n_active);
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
@@ -384,6 +388,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%p,,,", cookie, page);
@@ -421,6 +426,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
 
+	__fscache_use_cookie(cookie);
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
 
@@ -475,9 +481,11 @@ error:
 
 nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
@@ -514,6 +522,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,,%d,,,", cookie, *nr_pages);
@@ -547,6 +556,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
 
@@ -601,10 +611,12 @@ error:
 
 nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
 	_leave(" = -ENOBUFS");
@@ -626,6 +638,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%p,,,", cookie, page);
@@ -658,8 +671,9 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, &op->op) < 0)
-		goto nobufs_unlock;
+		goto nobufs_unlock_dec;
 	spin_unlock(&cookie->lock);
 
 	fscache_stat(&fscache_n_alloc_ops);
@@ -689,10 +703,13 @@ error:
 	_leave(" = %d", ret);
 	return ret;
 
+nobufs_unlock_dec:
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 nobufs:
 	fscache_stat(&fscache_n_allocs_nobufs);
 	_leave(" = -ENOBUFS");
@@ -889,6 +906,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 {
 	struct fscache_storage *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%x,", cookie, (u32) page->flags);
@@ -957,7 +975,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
 	op->store_limit = object->store_limit;
 
-	atomic_inc(&cookie->n_active);
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto submit_failed;
 
@@ -984,10 +1002,10 @@ already_pending:
 	return 0;
 
 submit_failed:
-	atomic_dec(&cookie->n_active);
 	spin_lock(&cookie->stores_lock);
 	radix_tree_delete(&cookie->stores, page->index);
 	spin_unlock(&cookie->stores_lock);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 	page_cache_release(page);
 	ret = -ENOBUFS;
 	goto nobufs;
@@ -999,6 +1017,8 @@ nobufs:
 	spin_unlock(&cookie->lock);
 	radix_tree_preload_end();
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_stores_nobufs);
 	_leave(" = -ENOBUFS");
 	return -ENOBUFS;
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 7823e9ef995e..96a2b66f5968 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -511,6 +511,11 @@ static inline void fscache_end_io(struct fscache_retrieval *op,
 	op->end_io_func(page, op->context, error);
 }
 
+static inline void __fscache_use_cookie(struct fscache_cookie *cookie)
+{
+	atomic_inc(&cookie->n_active);
+}
+
 /**
  * fscache_use_cookie - Request usage of cookie attached to an object
  * @object: Object description
@@ -524,6 +529,16 @@ static inline bool fscache_use_cookie(struct fscache_object *object)
 	return atomic_inc_not_zero(&cookie->n_active) != 0;
 }
 
+static inline bool __fscache_unuse_cookie(struct fscache_cookie *cookie)
+{
+	return atomic_dec_and_test(&cookie->n_active);
+}
+
+static inline void __fscache_wake_unused_cookie(struct fscache_cookie *cookie)
+{
+	wake_up_atomic_t(&cookie->n_active);
+}
+
 /**
  * fscache_unuse_cookie - Cease usage of cookie attached to an object
  * @object: Object description
@@ -534,8 +549,8 @@ static inline bool fscache_use_cookie(struct fscache_object *object)
 static inline void fscache_unuse_cookie(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
-	if (atomic_dec_and_test(&cookie->n_active))
-		wake_up_atomic_t(&cookie->n_active);
+	if (__fscache_unuse_cookie(cookie))
+		__fscache_wake_unused_cookie(cookie);
 }
 
 /*
-- 
cgit v1.3


From 94d30ae90a00cafe686c1057be57f4885f963abf Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 21 Sep 2013 00:09:31 +0100
Subject: FS-Cache: Provide the ability to enable/disable cookies

Provide the ability to enable and disable fscache cookies.  A disabled cookie
will reject or ignore further requests to:

	Acquire a child cookie
	Invalidate and update backing objects
	Check the consistency of a backing object
	Allocate storage for backing page
	Read backing pages
	Write to backing pages

but still allows:

	Checks/waits on the completion of already in-progress objects
	Uncaching of pages
	Relinquishment of cookies

Two new operations are provided:

 (1) Disable a cookie:

	void fscache_disable_cookie(struct fscache_cookie *cookie,
				    bool invalidate);

     If the cookie is not already disabled, this locks the cookie against other
     dis/enablement ops, marks the cookie as being disabled, discards or
     invalidates any backing objects and waits for cessation of activity on any
     associated object.

     This is a wrapper around a chunk split out of fscache_relinquish_cookie(),
     but it reinitialises the cookie such that it can be reenabled.

     All possible failures are handled internally.  The caller should consider
     calling fscache_uncache_all_inode_pages() afterwards to make sure all page
     markings are cleared up.

 (2) Enable a cookie:

	void fscache_enable_cookie(struct fscache_cookie *cookie,
				   bool (*can_enable)(void *data),
				   void *data)

     If the cookie is not already enabled, this locks the cookie against other
     dis/enablement ops, invokes can_enable() and, if the cookie is not an
     index cookie, will begin the procedure of acquiring backing objects.

     The optional can_enable() function is passed the data argument and returns
     a ruling as to whether or not enablement should actually be permitted to
     begin.

     All possible failures are handled internally.  The cookie will only be
     marked as enabled if provisional backing objects are allocated.

A later patch will introduce these to NFS.  Cookie enablement during nfs_open()
is then contingent on i_writecount <= 0.  can_enable() checks for a race
between open(O_RDONLY) and open(O_WRONLY/O_RDWR).  This simplifies NFS's cookie
handling and allows us to get rid of open(O_RDONLY) accidentally introducing
caching to an inode that's open for writing already.

One operation has its API modified:

 (3) Acquire a cookie.

	struct fscache_cookie *fscache_acquire_cookie(
		struct fscache_cookie *parent,
		const struct fscache_cookie_def *def,
		void *netfs_data,
		bool enable);

     This now has an additional argument that indicates whether the requested
     cookie should be enabled by default.  It doesn't need the can_enable()
     function because the caller must prevent multiple calls for the same netfs
     object and it doesn't need to take the enablement lock because no one else
     can get at the cookie before this returns.

Signed-off-by: David Howells <dhowells@redhat.com
---
 Documentation/filesystems/caching/netfs-api.txt |  73 ++++++++--
 fs/9p/cache.c                                   |   6 +-
 fs/afs/cell.c                                   |   2 +-
 fs/afs/inode.c                                  |   2 +-
 fs/afs/vlocation.c                              |   3 +-
 fs/afs/volume.c                                 |   2 +-
 fs/cachefiles/interface.c                       |   2 +-
 fs/ceph/cache.c                                 |   4 +-
 fs/cifs/fscache.c                               |   8 +-
 fs/fscache/cookie.c                             | 186 +++++++++++++++++-------
 fs/fscache/fsdef.c                              |   1 +
 fs/fscache/netfs.c                              |   1 +
 fs/fscache/object.c                             |   7 +-
 fs/fscache/page.c                               |  17 ++-
 fs/nfs/fscache.c                                |   8 +-
 include/linux/fscache-cache.h                   |  31 +---
 include/linux/fscache.h                         | 113 ++++++++++++--
 17 files changed, 334 insertions(+), 132 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 11a0a40ce445..aed6b94160b1 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -29,15 +29,16 @@ This document contains the following sections:
 	 (6) Index registration
 	 (7) Data file registration
 	 (8) Miscellaneous object registration
-	 (9) Setting the data file size
+ 	 (9) Setting the data file size
 	(10) Page alloc/read/write
 	(11) Page uncaching
 	(12) Index and data file consistency
-	(13) Miscellaneous cookie operations
-	(14) Cookie unregistration
-	(15) Index invalidation
-	(16) Data file invalidation
-	(17) FS-Cache specific page flags.
+	(13) Cookie enablement
+	(14) Miscellaneous cookie operations
+	(15) Cookie unregistration
+	(16) Index invalidation
+	(17) Data file invalidation
+	(18) FS-Cache specific page flags.
 
 
 =============================
@@ -334,7 +335,8 @@ the path to the file:
 	struct fscache_cookie *
 	fscache_acquire_cookie(struct fscache_cookie *parent,
 			       const struct fscache_object_def *def,
-			       void *netfs_data);
+			       void *netfs_data,
+			       bool enable);
 
 This function creates an index entry in the index represented by parent,
 filling in the index entry by calling the operations pointed to by def.
@@ -350,6 +352,10 @@ object needs to be created somewhere down the hierarchy.  Furthermore, an index
 may be created in several different caches independently at different times.
 This is all handled transparently, and the netfs doesn't see any of it.
 
+A cookie will be created in the disabled state if enabled is false.  A cookie
+must be enabled to do anything with it.  A disabled cookie can be enabled by
+calling fscache_enable_cookie() (see below).
+
 For example, with AFS, a cell would be added to the primary index.  This index
 entry would have a dependent inode containing a volume location index for the
 volume mappings within this cell:
@@ -357,7 +363,7 @@ volume mappings within this cell:
 	cell->cache =
 		fscache_acquire_cookie(afs_cache_netfs.primary_index,
 				       &afs_cell_cache_index_def,
-				       cell);
+				       cell, true);
 
 Then when a volume location was accessed, it would be entered into the cell's
 index and an inode would be allocated that acts as a volume type and hash chain
@@ -366,7 +372,7 @@ combination:
 	vlocation->cache =
 		fscache_acquire_cookie(cell->cache,
 				       &afs_vlocation_cache_index_def,
-				       vlocation);
+				       vlocation, true);
 
 And then a particular flavour of volume (R/O for example) could be added to
 that index, creating another index for vnodes (AFS inode equivalents):
@@ -374,7 +380,7 @@ that index, creating another index for vnodes (AFS inode equivalents):
 	volume->cache =
 		fscache_acquire_cookie(vlocation->cache,
 				       &afs_volume_cache_index_def,
-				       volume);
+				       volume, true);
 
 
 ======================
@@ -388,7 +394,7 @@ the object definition should be something other than index type.
 	vnode->cache =
 		fscache_acquire_cookie(volume->cache,
 				       &afs_vnode_cache_object_def,
-				       vnode);
+				       vnode, true);
 
 
 =================================
@@ -404,7 +410,7 @@ it would be some other type of object such as a data file.
 	xattr->cache =
 		fscache_acquire_cookie(vnode->cache,
 				       &afs_xattr_cache_object_def,
-				       xattr);
+				       xattr, true);
 
 Miscellaneous objects might be used to store extended attributes or directory
 entries for example.
@@ -733,6 +739,47 @@ Note that partial updates may happen automatically at other times, such as when
 data blocks are added to a data file object.
 
 
+=================
+COOKIE ENABLEMENT
+=================
+
+Cookies exist in one of two states: enabled and disabled.  If a cookie is
+disabled, it ignores all attempts to acquire child cookies; check, update or
+invalidate its state; allocate, read or write backing pages - though it is
+still possible to uncache pages and relinquish the cookie.
+
+The initial enablement state is set by fscache_acquire_cookie(), but the cookie
+can be enabled or disabled later.  To disable a cookie, call:
+    
+	void fscache_disable_cookie(struct fscache_cookie *cookie,
+    				    bool invalidate);
+    
+If the cookie is not already disabled, this locks the cookie against other
+enable and disable ops, marks the cookie as being disabled, discards or
+invalidates any backing objects and waits for cessation of activity on any
+associated object before unlocking the cookie.
+
+All possible failures are handled internally.  The caller should consider
+calling fscache_uncache_all_inode_pages() afterwards to make sure all page
+markings are cleared up.
+    
+Cookies can be enabled or reenabled with:
+    
+    	void fscache_enable_cookie(struct fscache_cookie *cookie,
+    				   bool (*can_enable)(void *data),
+    				   void *data)
+    
+If the cookie is not already enabled, this locks the cookie against other
+enable and disable ops, invokes can_enable() and, if the cookie is not an index
+cookie, will begin the procedure of acquiring backing objects.
+
+The optional can_enable() function is passed the data argument and returns a
+ruling as to whether or not enablement should actually be permitted to begin.
+
+All possible failures are handled internally.  The cookie will only be marked
+as enabled if provisional backing objects are allocated.
+
+
 ===============================
 MISCELLANEOUS COOKIE OPERATIONS
 ===============================
@@ -778,7 +825,7 @@ COOKIE UNREGISTRATION
 To get rid of a cookie, this function should be called.
 
 	void fscache_relinquish_cookie(struct fscache_cookie *cookie,
-				       int retire);
+				       bool retire);
 
 If retire is non-zero, then the object will be marked for recycling, and all
 copies of it will be removed from all active caches in which it is present.
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a9ea73d6dcf3..2b7a032c37bc 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -90,7 +90,7 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
 
 	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
 						&v9fs_cache_session_index_def,
-						v9ses);
+						v9ses, true);
 	p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
 		 v9ses, v9ses->fscache);
 }
@@ -204,7 +204,7 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
 	v9ses = v9fs_inode2v9ses(inode);
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  v9inode);
+						  v9inode, true);
 
 	p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
 		 inode, v9inode->fscache);
@@ -271,7 +271,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
 	v9ses = v9fs_inode2v9ses(inode);
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  v9inode);
+						  v9inode, true);
 	p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
 		 inode, old, v9inode->fscache);
 
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 3c090b7555ea..ca0a3cf93791 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
 	/* put it up for caching (this never returns an error) */
 	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
 					     &afs_cell_cache_index_def,
-					     cell);
+					     cell, true);
 #endif
 
 	/* add to the cell lists */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 789bc253b5f6..ce25d755b7aa 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -259,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 #ifdef CONFIG_AFS_FSCACHE
 	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
 					      &afs_vnode_cache_index_def,
-					      vnode);
+					      vnode, true);
 #endif
 
 	ret = afs_inode_map_status(vnode, key);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 57bcb1596530..b6df2e83809f 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
 #ifdef CONFIG_AFS_FSCACHE
 	vl->cache = fscache_acquire_cookie(vl->cell->cache,
-					   &afs_vlocation_cache_index_def, vl);
+					   &afs_vlocation_cache_index_def, vl,
+					   true);
 #endif
 
 	if (vl->valid) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 401eeb21869f..2b607257820c 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 #ifdef CONFIG_AFS_FSCACHE
 	volume->cache = fscache_acquire_cookie(vlocation->cache,
 					       &afs_volume_cache_index_def,
-					       volume);
+					       volume, true);
 #endif
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 43eb5592cdea..00baf1419989 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -270,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 #endif
 
 	/* delete retired objects */
-	if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
+	if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
 	    _object != cache->cache.fsdef
 	    ) {
 		_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e0b038..7db2e6ca4b8f 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
 {
 	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
 					      &ceph_fscache_fsid_object_def,
-					      fsc);
+					      fsc, true);
 
 	if (fsc->fscache == NULL) {
 		pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
@@ -204,7 +204,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
 
 	ci->fscache = fscache_acquire_cookie(fsc->fscache,
 					     &ceph_fscache_inode_object_def,
-					     ci);
+					     ci, true);
 done:
 	mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 2f4bc5a58054..fe2492d2a8fc 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -27,7 +27,7 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
 {
 	server->fscache =
 		fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
-				&cifs_fscache_server_index_def, server);
+				&cifs_fscache_server_index_def, server, true);
 	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
 		 __func__, server, server->fscache);
 }
@@ -46,7 +46,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
 
 	tcon->fscache =
 		fscache_acquire_cookie(server->fscache,
-				&cifs_fscache_super_index_def, tcon);
+				&cifs_fscache_super_index_def, tcon, true);
 	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
 		 __func__, server->fscache, tcon->fscache);
 }
@@ -69,7 +69,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
 		cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
-				&cifs_fscache_inode_object_def, cifsi);
+				&cifs_fscache_inode_object_def, cifsi, true);
 		cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
 			 __func__, tcon->fscache, cifsi->fscache);
 	}
@@ -119,7 +119,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 		cifsi->fscache = fscache_acquire_cookie(
 					cifs_sb_master_tcon(cifs_sb)->fscache,
 					&cifs_fscache_inode_object_def,
-					cifsi);
+					cifsi, true);
 		cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
 			 __func__, cifsi->fscache, old);
 	}
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d851aa555d28..29d7feb62cf7 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -58,15 +58,16 @@ void fscache_cookie_init_once(void *_cookie)
 struct fscache_cookie *__fscache_acquire_cookie(
 	struct fscache_cookie *parent,
 	const struct fscache_cookie_def *def,
-	void *netfs_data)
+	void *netfs_data,
+	bool enable)
 {
 	struct fscache_cookie *cookie;
 
 	BUG_ON(!def);
 
-	_enter("{%s},{%s},%p",
+	_enter("{%s},{%s},%p,%u",
 	       parent ? (char *) parent->def->name : "<no-parent>",
-	       def->name, netfs_data);
+	       def->name, netfs_data, enable);
 
 	fscache_stat(&fscache_n_acquires);
 
@@ -106,7 +107,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
 	cookie->def		= def;
 	cookie->parent		= parent;
 	cookie->netfs_data	= netfs_data;
-	cookie->flags		= 0;
+	cookie->flags		= (1 << FSCACHE_COOKIE_NO_DATA_YET);
 
 	/* radix tree insertion won't use the preallocation pool unless it's
 	 * told it may not wait */
@@ -124,16 +125,22 @@ struct fscache_cookie *__fscache_acquire_cookie(
 		break;
 	}
 
-	/* if the object is an index then we need do nothing more here - we
-	 * create indices on disk when we need them as an index may exist in
-	 * multiple caches */
-	if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
-		if (fscache_acquire_non_index_cookie(cookie) < 0) {
-			atomic_dec(&parent->n_children);
-			__fscache_cookie_put(cookie);
-			fscache_stat(&fscache_n_acquires_nobufs);
-			_leave(" = NULL");
-			return NULL;
+	if (enable) {
+		/* if the object is an index then we need do nothing more here
+		 * - we create indices on disk when we need them as an index
+		 * may exist in multiple caches */
+		if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+			if (fscache_acquire_non_index_cookie(cookie) == 0) {
+				set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+			} else {
+				atomic_dec(&parent->n_children);
+				__fscache_cookie_put(cookie);
+				fscache_stat(&fscache_n_acquires_nobufs);
+				_leave(" = NULL");
+				return NULL;
+			}
+		} else {
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
 		}
 	}
 
@@ -143,6 +150,39 @@ struct fscache_cookie *__fscache_acquire_cookie(
 }
 EXPORT_SYMBOL(__fscache_acquire_cookie);
 
+/*
+ * Enable a cookie to permit it to accept new operations.
+ */
+void __fscache_enable_cookie(struct fscache_cookie *cookie,
+			     bool (*can_enable)(void *data),
+			     void *data)
+{
+	_enter("%p", cookie);
+
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+
+	if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock;
+
+	if (can_enable && !can_enable(data)) {
+		/* The netfs decided it didn't want to enable after all */
+	} else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		/* Wait for outstanding disablement to complete */
+		__fscache_wait_on_invalidate(cookie);
+
+		if (fscache_acquire_non_index_cookie(cookie) == 0)
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	} else {
+		set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	}
+
+out_unlock:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+}
+EXPORT_SYMBOL(__fscache_enable_cookie);
+
 /*
  * acquire a non-index cookie
  * - this must make sure the index chain is instantiated and instantiate the
@@ -157,7 +197,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	_enter("");
 
-	cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 	/* now we need to see whether the backing objects for this cookie yet
 	 * exist, if not there'll be nothing to search */
@@ -180,9 +220,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	_debug("cache %s", cache->tag->name);
 
-	cookie->flags =
-		(1 << FSCACHE_COOKIE_LOOKING_UP) |
-		(1 << FSCACHE_COOKIE_NO_DATA_YET);
+	set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 
 	/* ask the cache to allocate objects for this cookie and its parent
 	 * chain */
@@ -398,7 +436,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
 	if (!hlist_empty(&cookie->backing_objects)) {
 		spin_lock(&cookie->lock);
 
-		if (!hlist_empty(&cookie->backing_objects) &&
+		if (fscache_cookie_enabled(cookie) &&
+		    !hlist_empty(&cookie->backing_objects) &&
 		    !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
 				      &cookie->flags)) {
 			object = hlist_entry(cookie->backing_objects.first,
@@ -452,10 +491,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
 
 	spin_lock(&cookie->lock);
 
-	/* update the index entry on disk in each cache backing this cookie */
-	hlist_for_each_entry(object,
-			     &cookie->backing_objects, cookie_link) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+	if (fscache_cookie_enabled(cookie)) {
+		/* update the index entry on disk in each cache backing this
+		 * cookie.
+		 */
+		hlist_for_each_entry(object,
+				     &cookie->backing_objects, cookie_link) {
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+		}
 	}
 
 	spin_unlock(&cookie->lock);
@@ -464,28 +507,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
 EXPORT_SYMBOL(__fscache_update_cookie);
 
 /*
- * release a cookie back to the cache
- * - the object will be marked as recyclable on disk if retire is true
- * - all dependents of this cookie must have already been unregistered
- *   (indices/files/pages)
+ * Disable a cookie to stop it from accepting new requests from the netfs.
  */
-void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 {
 	struct fscache_object *object;
+	bool awaken = false;
 
-	fscache_stat(&fscache_n_relinquishes);
-	if (retire)
-		fscache_stat(&fscache_n_relinquishes_retire);
-
-	if (!cookie) {
-		fscache_stat(&fscache_n_relinquishes_null);
-		_leave(" [no cookie]");
-		return;
-	}
-
-	_enter("%p{%s,%p,%d},%d",
-	       cookie, cookie->def->name, cookie->netfs_data,
-	       atomic_read(&cookie->n_active), retire);
+	_enter("%p,%u", cookie, invalidate);
 
 	ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
 
@@ -495,24 +524,82 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 		BUG();
 	}
 
-	/* No further netfs-accessing operations on this cookie permitted */
-	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
-	if (retire)
-		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+	if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock_enable;
+
+	/* If the cookie is being invalidated, wait for that to complete first
+	 * so that we can reuse the flag.
+	 */
+	__fscache_wait_on_invalidate(cookie);
+
+	/* Dispose of the backing objects */
+	set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags);
 
 	spin_lock(&cookie->lock);
-	hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+	if (!hlist_empty(&cookie->backing_objects)) {
+		hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+			if (invalidate)
+				set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+		}
+	} else {
+		if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+			awaken = true;
 	}
 	spin_unlock(&cookie->lock);
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
 
 	/* Wait for cessation of activity requiring access to the netfs (when
-	 * n_active reaches 0).
+	 * n_active reaches 0).  This makes sure outstanding reads and writes
+	 * have completed.
 	 */
 	if (!atomic_dec_and_test(&cookie->n_active))
 		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
 				 TASK_UNINTERRUPTIBLE);
 
+	/* Reset the cookie state if it wasn't relinquished */
+	if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
+		atomic_inc(&cookie->n_active);
+		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+	}
+
+out_unlock_enable:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_disable_cookie);
+
+/*
+ * release a cookie back to the cache
+ * - the object will be marked as recyclable on disk if retire is true
+ * - all dependents of this cookie must have already been unregistered
+ *   (indices/files/pages)
+ */
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
+{
+	fscache_stat(&fscache_n_relinquishes);
+	if (retire)
+		fscache_stat(&fscache_n_relinquishes_retire);
+
+	if (!cookie) {
+		fscache_stat(&fscache_n_relinquishes_null);
+		_leave(" [no cookie]");
+		return;
+	}
+
+	_enter("%p{%s,%p,%d},%d",
+	       cookie, cookie->def->name, cookie->netfs_data,
+	       atomic_read(&cookie->n_active), retire);
+
+	/* No further netfs-accessing operations on this cookie permitted */
+	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+
+	__fscache_disable_cookie(cookie, retire);
+
 	/* Clear pointers back to the netfs */
 	cookie->netfs_data	= NULL;
 	cookie->def		= NULL;
@@ -592,7 +679,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto inconsistent;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index 10a2ade0bdf8..5a117df2a9ef 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -59,6 +59,7 @@ struct fscache_cookie fscache_fsdef_index = {
 	.lock		= __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
 	.backing_objects = HLIST_HEAD_INIT,
 	.def		= &fscache_fsdef_index_def,
+	.flags		= 1 << FSCACHE_COOKIE_ENABLED,
 };
 EXPORT_SYMBOL(fscache_fsdef_index);
 
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index b1bb6117473a..989f39401547 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -45,6 +45,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
 	netfs->primary_index->parent		= &fscache_fsdef_index;
 	netfs->primary_index->netfs_data	= netfs;
+	netfs->primary_index->flags		= 1 << FSCACHE_COOKIE_ENABLED;
 
 	atomic_inc(&netfs->primary_index->parent->usage);
 	atomic_inc(&netfs->primary_index->parent->n_children);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 86d75a60b20c..dcb821617774 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -495,6 +495,7 @@ void fscache_object_lookup_negative(struct fscache_object *object)
 		 * returning ENODATA.
 		 */
 		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 		_debug("wake up lookup %p", &cookie->flags);
 		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
@@ -527,6 +528,7 @@ void fscache_obtained_object(struct fscache_object *object)
 
 		/* We do (presumably) have data */
 		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 		/* Allow write requests to begin stacking up and read requests
 		 * to begin shovelling data.
@@ -679,7 +681,8 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
 	 */
 	spin_lock(&cookie->lock);
 	hlist_del_init(&object->cookie_link);
-	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+	if (hlist_empty(&cookie->backing_objects) &&
+	    test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
 		awaken = true;
 	spin_unlock(&cookie->lock);
 
@@ -927,7 +930,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
 	 */
 	if (!fscache_use_cookie(object)) {
 		ASSERT(object->cookie->stores.rnode == NULL);
-		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+		set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
 		_leave(" [no cookie]");
 		return transit_to(KILL_OBJECT);
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 0fe42a6d0e9c..7f5c658af755 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -204,7 +204,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -410,7 +411,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 		return -ERESTARTSYS;
 
 	op = fscache_alloc_retrieval(cookie, page->mapping,
-				     end_io_func,context);
+				     end_io_func, context);
 	if (!op) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
@@ -419,7 +420,8 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -551,7 +553,8 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -666,7 +669,8 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -938,7 +942,8 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	ret = -ENOBUFS;
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 24d1d1c5fcaf..cd6e7efd4305 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)
 	/* create a cache index for looking up filehandles */
 	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
 					      &nfs_fscache_server_index_def,
-					      clp);
+					      clp, true);
 	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
 		 clp, clp->fscache);
 }
@@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
 	/* create a cache index for looking up filehandles */
 	nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
 					       &nfs_fscache_super_index_def,
-					       nfss);
+					       nfss, true);
 	dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
 		 nfss, nfss->fscache);
 	return;
@@ -200,7 +200,7 @@ static void nfs_fscache_enable_inode_cookie(struct inode *inode)
 		nfsi->fscache = fscache_acquire_cookie(
 			NFS_SB(sb)->fscache,
 			&nfs_fscache_inode_object_def,
-			nfsi);
+			nfsi, true);
 
 		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
 			 sb, nfsi, nfsi->fscache);
@@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
 		nfsi->fscache = fscache_acquire_cookie(
 			nfss->nfs_client->fscache,
 			&nfs_fscache_inode_object_def,
-			nfsi);
+			nfsi, true);
 
 		dfprintk(FSCACHE,
 			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 96a2b66f5968..771484993ca7 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -308,36 +308,6 @@ struct fscache_cache_ops {
 	void (*dissociate_pages)(struct fscache_cache *cache);
 };
 
-/*
- * data file or index object cookie
- * - a file will only appear in one cache
- * - a request to cache a file may or may not be honoured, subject to
- *   constraints such as disk space
- * - indices are created on disk just-in-time
- */
-struct fscache_cookie {
-	atomic_t			usage;		/* number of users of this cookie */
-	atomic_t			n_children;	/* number of children of this cookie */
-	atomic_t			n_active;	/* number of active users of netfs ptrs */
-	spinlock_t			lock;
-	spinlock_t			stores_lock;	/* lock on page store tree */
-	struct hlist_head		backing_objects; /* object(s) backing this file/index */
-	const struct fscache_cookie_def	*def;		/* definition */
-	struct fscache_cookie		*parent;	/* parent of this entry */
-	void				*netfs_data;	/* back pointer to netfs */
-	struct radix_tree_root		stores;		/* pages to be stored on this cookie */
-#define FSCACHE_COOKIE_PENDING_TAG	0		/* pages tag: pending write to cache */
-#define FSCACHE_COOKIE_STORING_TAG	1		/* pages tag: writing to cache */
-
-	unsigned long			flags;
-#define FSCACHE_COOKIE_LOOKING_UP	0	/* T if non-index cookie being looked up still */
-#define FSCACHE_COOKIE_NO_DATA_YET	1	/* T if new object with no cached data yet */
-#define FSCACHE_COOKIE_UNAVAILABLE	2	/* T if cookie is unavailable (error, etc) */
-#define FSCACHE_COOKIE_INVALIDATING	3	/* T if cookie is being invalidated */
-#define FSCACHE_COOKIE_RELINQUISHED	4	/* T if cookie has been relinquished */
-#define FSCACHE_COOKIE_RETIRED		5	/* T if cookie was retired */
-};
-
 extern struct fscache_cookie fscache_fsdef_index;
 
 /*
@@ -400,6 +370,7 @@ struct fscache_object {
 #define FSCACHE_OBJECT_IS_LIVE		3	/* T if object is not withdrawn or relinquished */
 #define FSCACHE_OBJECT_IS_LOOKED_UP	4	/* T if object has been looked up */
 #define FSCACHE_OBJECT_IS_AVAILABLE	5	/* T if object has become active */
+#define FSCACHE_OBJECT_RETIRED		6	/* T if object was retired on relinquishment */
 
 	struct list_head	cache_link;	/* link in cache->object_list */
 	struct hlist_node	cookie_link;	/* link in cookie->backing_objects */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 19b46458e4e8..115bb81912cc 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -166,6 +166,42 @@ struct fscache_netfs {
 	struct list_head		link;		/* internal link */
 };
 
+/*
+ * data file or index object cookie
+ * - a file will only appear in one cache
+ * - a request to cache a file may or may not be honoured, subject to
+ *   constraints such as disk space
+ * - indices are created on disk just-in-time
+ */
+struct fscache_cookie {
+	atomic_t			usage;		/* number of users of this cookie */
+	atomic_t			n_children;	/* number of children of this cookie */
+	atomic_t			n_active;	/* number of active users of netfs ptrs */
+	spinlock_t			lock;
+	spinlock_t			stores_lock;	/* lock on page store tree */
+	struct hlist_head		backing_objects; /* object(s) backing this file/index */
+	const struct fscache_cookie_def	*def;		/* definition */
+	struct fscache_cookie		*parent;	/* parent of this entry */
+	void				*netfs_data;	/* back pointer to netfs */
+	struct radix_tree_root		stores;		/* pages to be stored on this cookie */
+#define FSCACHE_COOKIE_PENDING_TAG	0		/* pages tag: pending write to cache */
+#define FSCACHE_COOKIE_STORING_TAG	1		/* pages tag: writing to cache */
+
+	unsigned long			flags;
+#define FSCACHE_COOKIE_LOOKING_UP	0	/* T if non-index cookie being looked up still */
+#define FSCACHE_COOKIE_NO_DATA_YET	1	/* T if new object with no cached data yet */
+#define FSCACHE_COOKIE_UNAVAILABLE	2	/* T if cookie is unavailable (error, etc) */
+#define FSCACHE_COOKIE_INVALIDATING	3	/* T if cookie is being invalidated */
+#define FSCACHE_COOKIE_RELINQUISHED	4	/* T if cookie has been relinquished */
+#define FSCACHE_COOKIE_ENABLED		5	/* T if cookie is enabled */
+#define FSCACHE_COOKIE_ENABLEMENT_LOCK	6	/* T if cookie is being en/disabled */
+};
+
+static inline bool fscache_cookie_enabled(struct fscache_cookie *cookie)
+{
+	return test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+}
+
 /*
  * slow-path functions for when there is actually caching available, and the
  * netfs does actually have a valid token
@@ -181,8 +217,8 @@ extern void __fscache_release_cache_tag(struct fscache_cache_tag *);
 extern struct fscache_cookie *__fscache_acquire_cookie(
 	struct fscache_cookie *,
 	const struct fscache_cookie_def *,
-	void *);
-extern void __fscache_relinquish_cookie(struct fscache_cookie *, int);
+	void *, bool);
+extern void __fscache_relinquish_cookie(struct fscache_cookie *, bool);
 extern int __fscache_check_consistency(struct fscache_cookie *);
 extern void __fscache_update_cookie(struct fscache_cookie *);
 extern int __fscache_attr_changed(struct fscache_cookie *);
@@ -211,6 +247,9 @@ extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *,
 					      struct inode *);
 extern void __fscache_readpages_cancel(struct fscache_cookie *cookie,
 				       struct list_head *pages);
+extern void __fscache_disable_cookie(struct fscache_cookie *, bool);
+extern void __fscache_enable_cookie(struct fscache_cookie *,
+				    bool (*)(void *), void *);
 
 /**
  * fscache_register_netfs - Register a filesystem as desiring caching services
@@ -289,6 +328,7 @@ void fscache_release_cache_tag(struct fscache_cache_tag *tag)
  * @def: A description of the cache object, including callback operations
  * @netfs_data: An arbitrary piece of data to be kept in the cookie to
  * represent the cache object to the netfs
+ * @enable: Whether or not to enable a data cookie immediately
  *
  * This function is used to inform FS-Cache about part of an index hierarchy
  * that can be used to locate files.  This is done by requesting a cookie for
@@ -301,10 +341,12 @@ static inline
 struct fscache_cookie *fscache_acquire_cookie(
 	struct fscache_cookie *parent,
 	const struct fscache_cookie_def *def,
-	void *netfs_data)
+	void *netfs_data,
+	bool enable)
 {
-	if (fscache_cookie_valid(parent))
-		return __fscache_acquire_cookie(parent, def, netfs_data);
+	if (fscache_cookie_valid(parent) && fscache_cookie_enabled(parent))
+		return __fscache_acquire_cookie(parent, def, netfs_data,
+						enable);
 	else
 		return NULL;
 }
@@ -322,7 +364,7 @@ struct fscache_cookie *fscache_acquire_cookie(
  * description.
  */
 static inline
-void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+void fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
 {
 	if (fscache_cookie_valid(cookie))
 		__fscache_relinquish_cookie(cookie, retire);
@@ -341,7 +383,7 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 static inline
 int fscache_check_consistency(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_check_consistency(cookie);
 	else
 		return 0;
@@ -360,7 +402,7 @@ int fscache_check_consistency(struct fscache_cookie *cookie)
 static inline
 void fscache_update_cookie(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		__fscache_update_cookie(cookie);
 }
 
@@ -407,7 +449,7 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie)
 static inline
 int fscache_attr_changed(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_attr_changed(cookie);
 	else
 		return -ENOBUFS;
@@ -429,7 +471,7 @@ int fscache_attr_changed(struct fscache_cookie *cookie)
 static inline
 void fscache_invalidate(struct fscache_cookie *cookie)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		__fscache_invalidate(cookie);
 }
 
@@ -503,7 +545,7 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 			       void *context,
 			       gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_read_or_alloc_page(cookie, page, end_io_func,
 						    context, gfp);
 	else
@@ -554,7 +596,7 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 				void *context,
 				gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_read_or_alloc_pages(cookie, mapping, pages,
 						     nr_pages, end_io_func,
 						     context, gfp);
@@ -585,7 +627,7 @@ int fscache_alloc_page(struct fscache_cookie *cookie,
 		       struct page *page,
 		       gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_alloc_page(cookie, page, gfp);
 	else
 		return -ENOBUFS;
@@ -634,7 +676,7 @@ int fscache_write_page(struct fscache_cookie *cookie,
 		       struct page *page,
 		       gfp_t gfp)
 {
-	if (fscache_cookie_valid(cookie))
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
 		return __fscache_write_page(cookie, page, gfp);
 	else
 		return -ENOBUFS;
@@ -744,4 +786,47 @@ void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
 		__fscache_uncache_all_inode_pages(cookie, inode);
 }
 
+/**
+ * fscache_disable_cookie - Disable a cookie
+ * @cookie: The cookie representing the cache object
+ * @invalidate: Invalidate the backing object
+ *
+ * Disable a cookie from accepting further alloc, read, write, invalidate,
+ * update or acquire operations.  Outstanding operations can still be waited
+ * upon and pages can still be uncached and the cookie relinquished.
+ *
+ * This will not return until all outstanding operations have completed.
+ *
+ * If @invalidate is set, then the backing object will be invalidated and
+ * detached, otherwise it will just be detached.
+ */
+static inline
+void fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
+{
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
+		__fscache_disable_cookie(cookie, invalidate);
+}
+
+/**
+ * fscache_enable_cookie - Reenable a cookie
+ * @cookie: The cookie representing the cache object
+ * @can_enable: A function to permit enablement once lock is held
+ * @data: Data for can_enable()
+ *
+ * Reenable a previously disabled cookie, allowing it to accept further alloc,
+ * read, write, invalidate, update or acquire operations.  An attempt will be
+ * made to immediately reattach the cookie to a backing object.
+ *
+ * The can_enable() function is called (if not NULL) once the enablement lock
+ * is held to rule on whether enablement is still permitted to go ahead.
+ */
+static inline
+void fscache_enable_cookie(struct fscache_cookie *cookie,
+			   bool (*can_enable)(void *data),
+			   void *data)
+{
+	if (fscache_cookie_valid(cookie) && !fscache_cookie_enabled(cookie))
+		__fscache_enable_cookie(cookie, can_enable, data);
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.3


From f1fe29b4a02d0805aa7d0ff6b73410a9f9316d69 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 Sep 2013 11:20:03 +0100
Subject: NFS: Use i_writecount to control whether to get an fscache cookie in
 nfs_open()

Use i_writecount to control whether to get an fscache cookie in nfs_open() as
NFS does not do write caching yet.  I *think* this is the cause of a problem
encountered by Mark Moseley whereby __fscache_uncache_page() gets a NULL
pointer dereference because cookie->def is NULL:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
IP: [<ffffffff812a1903>] __fscache_uncache_page+0x23/0x160
PGD 0
Thread overran stack, or stack corrupted
Oops: 0000 [#1] SMP
Modules linked in: ...
CPU: 7 PID: 18993 Comm: php Not tainted 3.11.1 #1
Hardware name: Dell Inc. PowerEdge R420/072XWF, BIOS 1.3.5 08/21/2012
task: ffff8804203460c0 ti: ffff880420346640
RIP: 0010:[<ffffffff812a1903>] __fscache_uncache_page+0x23/0x160
RSP: 0018:ffff8801053af878 EFLAGS: 00210286
RAX: 0000000000000000 RBX: ffff8800be2f8780 RCX: ffff88022ffae5e8
RDX: 0000000000004c66 RSI: ffffea00055ff440 RDI: ffff8800be2f8780
RBP: ffff8801053af898 R08: 0000000000000001 R09: 0000000000000003
R10: 0000000000000000 R11: 0000000000000000 R12: ffffea00055ff440
R13: 0000000000001000 R14: ffff8800c50be538 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff88042fc60000(0063) knlGS:00000000e439c700
CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000000000010 CR3: 0000000001d8f000 CR4: 00000000000607f0
Stack:
...
Call Trace:
[<ffffffff81365a72>] __nfs_fscache_invalidate_page+0x42/0x70
[<ffffffff813553d5>] nfs_invalidate_page+0x75/0x90
[<ffffffff811b8f5e>] truncate_inode_page+0x8e/0x90
[<ffffffff811b90ad>] truncate_inode_pages_range.part.12+0x14d/0x620
[<ffffffff81d6387d>] ? __mutex_lock_slowpath+0x1fd/0x2e0
[<ffffffff811b95d3>] truncate_inode_pages_range+0x53/0x70
[<ffffffff811b969d>] truncate_inode_pages+0x2d/0x40
[<ffffffff811b96ff>] truncate_pagecache+0x4f/0x70
[<ffffffff81356840>] nfs_setattr_update_inode+0xa0/0x120
[<ffffffff81368de4>] nfs3_proc_setattr+0xc4/0xe0
[<ffffffff81357f78>] nfs_setattr+0xc8/0x150
[<ffffffff8122d95b>] notify_change+0x1cb/0x390
[<ffffffff8120a55b>] do_truncate+0x7b/0xc0
[<ffffffff8121f96c>] do_last+0xa4c/0xfd0
[<ffffffff8121ffbc>] path_openat+0xcc/0x670
[<ffffffff81220a0e>] do_filp_open+0x4e/0xb0
[<ffffffff8120ba1f>] do_sys_open+0x13f/0x2b0
[<ffffffff8126aaf6>] compat_SyS_open+0x36/0x50
[<ffffffff81d7204c>] sysenter_dispatch+0x7/0x24

The code at the instruction pointer was disassembled:

> (gdb) disas __fscache_uncache_page
> Dump of assembler code for function __fscache_uncache_page:
> ...
> 0xffffffff812a18ff <+31>: mov 0x48(%rbx),%rax
> 0xffffffff812a1903 <+35>: cmpb $0x0,0x10(%rax)
> 0xffffffff812a1907 <+39>: je 0xffffffff812a19cd <__fscache_uncache_page+237>

These instructions make up:

	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);

That cmpb is the faulting instruction (%rax is 0).  So cookie->def is NULL -
which presumably means that the cookie has already been at least partway
through __fscache_relinquish_cookie().

What I think may be happening is something like a three-way race on the same
file:

	PROCESS 1	PROCESS 2	PROCESS 3
	===============	===============	===============
	open(O_TRUNC|O_WRONLY)
			open(O_RDONLY)
					open(O_WRONLY)
	-->nfs_open()
	-->nfs_fscache_set_inode_cookie()
	nfs_fscache_inode_lock()
	nfs_fscache_disable_inode_cookie()
	__fscache_relinquish_cookie()
	nfs_inode->fscache = NULL
	<--nfs_fscache_set_inode_cookie()

			-->nfs_open()
			-->nfs_fscache_set_inode_cookie()
			nfs_fscache_inode_lock()
			nfs_fscache_enable_inode_cookie()
			__fscache_acquire_cookie()
			nfs_inode->fscache = cookie
			<--nfs_fscache_set_inode_cookie()
	<--nfs_open()
	-->nfs_setattr()
	...
	...
	-->nfs_invalidate_page()
	-->__nfs_fscache_invalidate_page()
	cookie = nfsi->fscache
					-->nfs_open()
					-->nfs_fscache_set_inode_cookie()
					nfs_fscache_inode_lock()
					nfs_fscache_disable_inode_cookie()
					-->__fscache_relinquish_cookie()
	-->__fscache_uncache_page(cookie)
	<crash>
					<--__fscache_relinquish_cookie()
					nfs_inode->fscache = NULL
					<--nfs_fscache_set_inode_cookie()

What is needed is something to prevent process #2 from reacquiring the cookie
- and I think checking i_writecount should do the trick.

It's also possible to have a two-way race on this if the file is opened
O_TRUNC|O_RDONLY instead.

Reported-by: Mark Moseley <moseleymark@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/nfs/dir.c           |   2 +-
 fs/nfs/fscache.c       | 198 ++++++++++++++-----------------------------------
 fs/nfs/fscache.h       |  18 ++---
 fs/nfs/inode.c         |   6 +-
 fs/nfs/nfs4file.c      |   2 +-
 include/linux/fs.h     |   5 ++
 include/linux/nfs_fs.h |   8 +-
 7 files changed, 79 insertions(+), 160 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 854a8f05a610..4c5edcc8b6e9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1381,7 +1381,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i
 
 static int do_open(struct inode *inode, struct file *filp)
 {
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	return 0;
 }
 
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index cd6e7efd4305..3ef01f0ba0bc 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
 /*
  * Initialise the per-inode cache cookie pointer for an NFS inode.
  */
-void nfs_fscache_init_inode_cookie(struct inode *inode)
+void nfs_fscache_init_inode(struct inode *inode)
 {
-	NFS_I(inode)->fscache = NULL;
-	if (S_ISREG(inode->i_mode))
-		set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-}
-
-/*
- * Get the per-inode cache cookie for an NFS inode.
- */
-static void nfs_fscache_enable_inode_cookie(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (nfsi->fscache || !NFS_FSCACHE(inode))
+	nfsi->fscache = NULL;
+	if (!S_ISREG(inode->i_mode))
 		return;
-
-	if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
-		nfsi->fscache = fscache_acquire_cookie(
-			NFS_SB(sb)->fscache,
-			&nfs_fscache_inode_object_def,
-			nfsi, true);
-
-		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
-			 sb, nfsi, nfsi->fscache);
-	}
+	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+					       &nfs_fscache_inode_object_def,
+					       nfsi, false);
 }
 
 /*
  * Release a per-inode cookie.
  */
-void nfs_fscache_release_inode_cookie(struct inode *inode)
+void nfs_fscache_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
-	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
-		 nfsi, nfsi->fscache);
+	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
 
-	fscache_relinquish_cookie(nfsi->fscache, 0);
+	fscache_relinquish_cookie(cookie, false);
 	nfsi->fscache = NULL;
 }
 
-/*
- * Retire a per-inode cookie, destroying the data attached to it.
- */
-void nfs_fscache_zap_inode_cookie(struct inode *inode)
+static bool nfs_fscache_can_enable(void *data)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct inode *inode = data;
 
-	dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
-		 nfsi, nfsi->fscache);
-
-	fscache_relinquish_cookie(nfsi->fscache, 1);
-	nfsi->fscache = NULL;
+	return !inode_is_open_for_write(inode);
 }
 
 /*
- * Turn off the cache with regard to a per-inode cookie if opened for writing,
- * invalidating all the pages in the page cache relating to the associated
- * inode to clear the per-page caching.
- */
-static void nfs_fscache_disable_inode_cookie(struct inode *inode)
-{
-	clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-
-	if (NFS_I(inode)->fscache) {
-		dfprintk(FSCACHE,
-			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
-
-		/* Need to uncache any pages attached to this inode that
-		 * fscache knows about before turning off the cache.
-		 */
-		fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode);
-		nfs_fscache_zap_inode_cookie(inode);
-	}
-}
-
-/*
- * wait_on_bit() sleep function for uninterruptible waiting
- */
-static int nfs_fscache_wait_bit(void *flags)
-{
-	schedule();
-	return 0;
-}
-
-/*
- * Lock against someone else trying to also acquire or relinquish a cookie
- */
-static inline void nfs_fscache_inode_lock(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
-		wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
-			    nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * Unlock cookie management lock
- */
-static inline void nfs_fscache_inode_unlock(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	smp_mb__before_clear_bit();
-	clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
-}
-
-/*
- * Decide if we should enable or disable local caching for this inode.
- * - For now, with NFS, only regular files that are open read-only will be able
- *   to use the cache.
- * - May be invoked multiple times in parallel by parallel nfs_open() functions.
- */
-void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
-{
-	if (NFS_FSCACHE(inode)) {
-		nfs_fscache_inode_lock(inode);
-		if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
-			nfs_fscache_disable_inode_cookie(inode);
-		else
-			nfs_fscache_enable_inode_cookie(inode);
-		nfs_fscache_inode_unlock(inode);
-	}
-}
-EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
-
-/*
- * Replace a per-inode cookie due to revalidation detecting a file having
- * changed on the server.
+ * Enable or disable caching for a file that is being opened as appropriate.
+ * The cookie is allocated when the inode is initialised, but is not enabled at
+ * that time.  Enablement is deferred to file-open time to avoid stat() and
+ * access() thrashing the cache.
+ *
+ * For now, with NFS, only regular files that are open read-only will be able
+ * to use the cache.
+ *
+ * We enable the cache for an inode if we open it read-only and it isn't
+ * currently open for writing.  We disable the cache if the inode is open
+ * write-only.
+ *
+ * The caller uses the file struct to pin i_writecount on the inode before
+ * calling us when a file is opened for writing, so we can make use of that.
+ *
+ * Note that this may be invoked multiple times in parallel by parallel
+ * nfs_open() functions.
  */
-void nfs_fscache_reset_inode_cookie(struct inode *inode)
+void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_server *nfss = NFS_SERVER(inode);
-	NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
-	nfs_fscache_inode_lock(inode);
-	if (nfsi->fscache) {
-		/* retire the current fscache cache and get a new one */
-		fscache_relinquish_cookie(nfsi->fscache, 1);
-
-		nfsi->fscache = fscache_acquire_cookie(
-			nfss->nfs_client->fscache,
-			&nfs_fscache_inode_object_def,
-			nfsi, true);
+	if (!fscache_cookie_valid(cookie))
+		return;
 
-		dfprintk(FSCACHE,
-			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
-			 nfss, nfsi, old, nfsi->fscache);
+	if (inode_is_open_for_write(inode)) {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
+		clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
+		fscache_disable_cookie(cookie, true);
+		fscache_uncache_all_inode_pages(cookie, inode);
+	} else {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
+		fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode);
+		if (fscache_cookie_enabled(cookie))
+			set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
 	}
-	nfs_fscache_inode_unlock(inode);
 }
+EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
 
 /*
  * Release the caching state associated with a page, if the page isn't busy
@@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
 int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
 	if (PageFsCache(page)) {
-		struct nfs_inode *nfsi = NFS_I(page->mapping->host);
-		struct fscache_cookie *cookie = nfsi->fscache;
+		struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);
 
 		BUG_ON(!cookie);
 		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
-			 cookie, page, nfsi);
+			 cookie, page, NFS_I(page->mapping->host));
 
 		if (!fscache_maybe_release_page(cookie, page, gfp))
 			return 0;
@@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
  */
 void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct fscache_cookie *cookie = nfsi->fscache;
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
 	BUG_ON(!cookie);
 
 	dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
-		 cookie, page, nfsi);
+		 cookie, page, NFS_I(inode));
 
 	fscache_wait_on_page_write(cookie, page);
 
@@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
 
 	dfprintk(FSCACHE,
 		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
-		 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+		 nfs_i_fscache(inode), page, page->index, page->flags, inode);
 
-	ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+	ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),
 					 page,
 					 nfs_readpage_from_fscache_complete,
 					 ctx,
@@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
 	int ret;
 
 	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
-		 NFS_I(inode)->fscache, npages, inode);
+		 nfs_i_fscache(inode), npages, inode);
 
-	ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+	ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),
 					  mapping, pages, nr_pages,
 					  nfs_readpage_from_fscache_complete,
 					  ctx,
@@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
 
 	dfprintk(FSCACHE,
 		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
-		 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+		 nfs_i_fscache(inode), page, page->index, page->flags, sync);
 
-	ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+	ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);
 	dfprintk(FSCACHE,
 		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
 		 page, page->index, page->flags, ret);
 
 	if (ret != 0) {
-		fscache_uncache_page(NFS_I(inode)->fscache, page);
+		fscache_uncache_page(nfs_i_fscache(inode), page);
 		nfs_add_fscache_stats(inode,
 				      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
 		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 4ecb76652eba..d7fe3e799f2f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *);
 extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
 extern void nfs_fscache_release_super_cookie(struct super_block *);
 
-extern void nfs_fscache_init_inode_cookie(struct inode *);
-extern void nfs_fscache_release_inode_cookie(struct inode *);
-extern void nfs_fscache_zap_inode_cookie(struct inode *);
-extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
-extern void nfs_fscache_reset_inode_cookie(struct inode *);
+extern void nfs_fscache_init_inode(struct inode *);
+extern void nfs_fscache_clear_inode(struct inode *);
+extern void nfs_fscache_open_file(struct inode *, struct file *);
 
 extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
 extern int nfs_fscache_release_page(struct page *, gfp_t);
@@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
 
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
-static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
-						struct file *filp) {}
-static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_init_inode(struct inode *inode) {}
+static inline void nfs_fscache_clear_inode(struct inode *inode) {}
+static inline void nfs_fscache_open_file(struct inode *inode,
+					 struct file *filp) {}
 
 static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eda8879171c4..bb90bff0cb7a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -122,7 +122,7 @@ void nfs_clear_inode(struct inode *inode)
 	WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
-	nfs_fscache_release_inode_cookie(inode);
+	nfs_fscache_clear_inode(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_clear_inode);
 
@@ -459,7 +459,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		nfsi->attrtimeo_timestamp = now;
 		nfsi->access_cache = RB_ROOT;
 
-		nfs_fscache_init_inode_cookie(inode);
+		nfs_fscache_init_inode(inode);
 
 		unlock_new_inode(inode);
 	} else
@@ -854,7 +854,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 		return PTR_ERR(ctx);
 	nfs_file_set_open_context(filp, ctx);
 	put_nfs_open_context(ctx);
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e5b804dd944c..5b8a618a0f7a 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -74,7 +74,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	nfs_file_set_open_context(filp, ctx);
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	err = 0;
 
 out_put_ctx:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f40547ba191..955dff5da56a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2292,6 +2292,11 @@ static inline void allow_write_access(struct file *file)
 	if (file)
 		atomic_inc(&file_inode(file)->i_writecount);
 }
+static inline bool inode_is_open_for_write(const struct inode *inode)
+{
+	return atomic_read(&inode->i_writecount) > 0;
+}
+
 #ifdef CONFIG_IMA
 static inline void i_readcount_dec(struct inode *inode)
 {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 3ea4cde8701c..14a48207a304 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -269,9 +269,13 @@ static inline int NFS_STALE(const struct inode *inode)
 	return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
 }
 
-static inline int NFS_FSCACHE(const struct inode *inode)
+static inline struct fscache_cookie *nfs_i_fscache(struct inode *inode)
 {
-	return test_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+#ifdef CONFIG_NFS_FSCACHE
+	return NFS_I(inode)->fscache;
+#else
+	return NULL;
+#endif
 }
 
 static inline __u64 NFS_FILEID(const struct inode *inode)
-- 
cgit v1.3


From f112a049712a5c07de25d511c3c6587a2b1a015e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 30 Sep 2013 09:37:03 +1000
Subject: xfs: lockdep needs to know about 3 dquot-deep nesting

Michael Semon reported that xfs/299 generated this lockdep warning:

=============================================
[ INFO: possible recursive locking detected ]
3.12.0-rc2+ #2 Not tainted
---------------------------------------------
touch/21072 is trying to acquire lock:
 (&xfs_dquot_other_class){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64

but task is already holding lock:
 (&xfs_dquot_other_class){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&xfs_dquot_other_class);
  lock(&xfs_dquot_other_class);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

7 locks held by touch/21072:
 #0:  (sb_writers#10){++++.+}, at: [<c11185b6>] mnt_want_write+0x1e/0x3e
 #1:  (&type->i_mutex_dir_key#4){+.+.+.}, at: [<c11078ee>] do_last+0x245/0xe40
 #2:  (sb_internal#2){++++.+}, at: [<c122c9e0>] xfs_trans_alloc+0x1f/0x35
 #3:  (&(&ip->i_lock)->mr_lock/1){+.+...}, at: [<c126cd1b>] xfs_ilock+0x100/0x1f1
 #4:  (&(&ip->i_lock)->mr_lock){++++-.}, at: [<c126cf52>] xfs_ilock_nowait+0x105/0x22f
 #5:  (&dqp->q_qlock){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64
 #6:  (&xfs_dquot_other_class){+.+...}, at: [<c12902fb>] xfs_trans_dqlockedjoin+0x57/0x64

The lockdep annotation for dquot lock nesting only understands
locking for user and "other" dquots, not user, group and quota
dquots. Fix the annotations to match the locking heirarchy we now
have.

Reported-by: Michael L. Semon <mlsemon35@gmail.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dquot.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 71520e6e5d65..1ee776d477c3 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -64,7 +64,8 @@ int xfs_dqerror_mod = 33;
 struct kmem_zone		*xfs_qm_dqtrxzone;
 static struct kmem_zone		*xfs_qm_dqzone;
 
-static struct lock_class_key xfs_dquot_other_class;
+static struct lock_class_key xfs_dquot_group_class;
+static struct lock_class_key xfs_dquot_project_class;
 
 /*
  * This is called to free all the memory associated with a dquot
@@ -703,8 +704,20 @@ xfs_qm_dqread(
 	 * Make sure group quotas have a different lock class than user
 	 * quotas.
 	 */
-	if (!(type & XFS_DQ_USER))
-		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+	switch (type) {
+	case XFS_DQ_USER:
+		/* uses the default lock class */
+		break;
+	case XFS_DQ_GROUP:
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
+		break;
+	case XFS_DQ_PROJ:
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
 
 	XFS_STATS_INC(xs_qm_dquot);
 
-- 
cgit v1.3


From 367993e7c6428cb7617ab7653d61dca54e2fdede Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 30 Sep 2013 09:37:04 +1000
Subject: xfs: dirent dtype presence is dependent on directory magic numbers

The determination of whether a directory entry contains a dtype
field originally was dependent on the filesystem having CRCs
enabled. This meant that the format for dtype beign enabled could be
determined by checking the directory block magic number rather than
doing a feature bit check. This was useful in that it meant that we
didn't need to pass a struct xfs_mount around to functions that
were already supplied with a directory block header.

Unfortunately, the introduction of dtype fields into the v4
structure via a feature bit meant this "use the directory block
magic number" method of discriminating the dirent entry sizes is
broken. Hence we need to convert the places that use magic number
checks to use feature bit checks so that they work correctly and not
by chance.

The current code works on v4 filesystems only because the dirent
size roundup covers the extra byte needed by the dtype field in the
places where this problem occurs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_block.c   |  6 +++---
 fs/xfs/xfs_dir2_format.h  | 51 +++++++++++++++++++----------------------------
 fs/xfs/xfs_dir2_readdir.c |  4 ++--
 fs/xfs/xfs_dir2_sf.c      |  6 +++---
 4 files changed, 28 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 0957aa98b6c0..12dad188939d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1158,7 +1158,7 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Create entry for .
 	 */
-	dep = xfs_dir3_data_dot_entry_p(hdr);
+	dep = xfs_dir3_data_dot_entry_p(mp, hdr);
 	dep->inumber = cpu_to_be64(dp->i_ino);
 	dep->namelen = 1;
 	dep->name[0] = '.';
@@ -1172,7 +1172,7 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Create entry for ..
 	 */
-	dep = xfs_dir3_data_dotdot_entry_p(hdr);
+	dep = xfs_dir3_data_dotdot_entry_p(mp, hdr);
 	dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
 	dep->namelen = 2;
 	dep->name[0] = dep->name[1] = '.';
@@ -1183,7 +1183,7 @@ xfs_dir2_sf_to_block(
 	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
 	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
 				(char *)dep - (char *)hdr));
-	offset = xfs_dir3_data_first_offset(hdr);
+	offset = xfs_dir3_data_first_offset(mp);
 	/*
 	 * Loop over existing entries, stuff them in.
 	 */
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index a0961a61ac1a..9cf67381adf6 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -497,69 +497,58 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
 /*
  * Offsets of . and .. in data space (always block 0)
  *
- * The macros are used for shortform directories as they have no headers to read
- * the magic number out of. Shortform directories need to know the size of the
- * data block header because the sfe embeds the block offset of the entry into
- * it so that it doesn't change when format conversion occurs. Bad Things Happen
- * if we don't follow this rule.
- *
  * XXX: there is scope for significant optimisation of the logic here. Right
  * now we are checking for "dir3 format" over and over again. Ideally we should
  * only do it once for each operation.
  */
-#define	XFS_DIR3_DATA_DOT_OFFSET(mp)	\
-	xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
-#define	XFS_DIR3_DATA_DOTDOT_OFFSET(mp)	\
-	(XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1))
-#define	XFS_DIR3_DATA_FIRST_OFFSET(mp)		\
-	(XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2))
-
 static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dot_offset(struct xfs_mount *mp)
 {
-	return xfs_dir3_data_entry_offset(hdr);
+	return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
 }
 
 static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
 {
-	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-	return xfs_dir3_data_dot_offset(hdr) +
-		__xfs_dir3_data_entsize(dir3, 1);
+	return xfs_dir3_data_dot_offset(mp) +
+		xfs_dir3_data_entsize(mp, 1);
 }
 
 static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_first_offset(struct xfs_mount *mp)
 {
-	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-	return xfs_dir3_data_dotdot_offset(hdr) +
-		__xfs_dir3_data_entsize(dir3, 2);
+	return xfs_dir3_data_dotdot_offset(mp) +
+		xfs_dir3_data_entsize(mp, 2);
 }
 
 /*
  * location of . and .. in data space (always block 0)
  */
 static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dot_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dot_offset(hdr));
+		((char *)hdr + xfs_dir3_data_dot_offset(mp));
 }
 
 static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dotdot_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dotdot_offset(hdr));
+		((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
 }
 
 static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_first_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_first_offset(hdr));
+		((char *)hdr + xfs_dir3_data_first_offset(mp));
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 8993ec17452c..8f84153e98a8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -119,9 +119,9 @@ xfs_dir2_sf_getdents(
 	 * mp->m_dirdatablk.
 	 */
 	dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-					     XFS_DIR3_DATA_DOT_OFFSET(mp));
+					     xfs_dir3_data_dot_offset(mp));
 	dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
+						xfs_dir3_data_dotdot_offset(mp));
 
 	/*
 	 * Put . entry unless we're starting past it.
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index bb6e2848f473..3ef6d402084c 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -557,7 +557,7 @@ xfs_dir2_sf_addname_hard(
 	 * to insert the new entry.
 	 * If it's going to end up at the end then oldsfep will point there.
 	 */
-	for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp),
+	for (offset = xfs_dir3_data_first_offset(mp),
 	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
 	      add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
 	      eof = (char *)oldsfep == &buf[old_isize];
@@ -640,7 +640,7 @@ xfs_dir2_sf_addname_pick(
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	size = xfs_dir3_data_entsize(mp, args->namelen);
-	offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
+	offset = xfs_dir3_data_first_offset(mp);
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	holefit = 0;
 	/*
@@ -713,7 +713,7 @@ xfs_dir2_sf_check(
 	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
+	offset = xfs_dir3_data_first_offset(mp);
 	ino = xfs_dir2_sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
-- 
cgit v1.3


From 519ccb81ac1c8e3e4eed294acf93be00b43dcad6 Mon Sep 17 00:00:00 2001
From: "tinguely@sgi.com" <tinguely@sgi.com>
Date: Fri, 27 Sep 2013 09:00:55 -0500
Subject: xfs: fix memory leak in xlog_recover_add_to_trans

Free the memory in error path of xlog_recover_add_to_trans().
Normally this memory is freed in recovery pass2, but is leaked
in the error path.

Signed-off-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index cc179878fe41..43240583fd5b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1585,6 +1585,7 @@ xlog_recover_add_to_trans(
 		"bad number of regions (%d) in inode log format",
 				  in_f->ilf_size);
 			ASSERT(0);
+			free(ptr);
 			return XFS_ERROR(EIO);
 		}
 
-- 
cgit v1.3


From adcadfa8f373f301e9f622fbf45957adf2d98622 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Sat, 29 Jun 2013 21:42:20 +0400
Subject: fuse: Getting file for writeback helper

There will be a .writepageS callback implementation which will need to
get a fuse_file out of a fuse_inode, thus make a helper for this.

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4598345ab87d..aab6d05355e4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1494,6 +1494,20 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_writepage_free(fc, req);
 }
 
+static struct fuse_file *fuse_write_file(struct fuse_conn *fc,
+					 struct fuse_inode *fi)
+{
+	struct fuse_file *ff;
+
+	spin_lock(&fc->lock);
+	BUG_ON(list_empty(&fi->write_files));
+	ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
+	fuse_file_get(ff);
+	spin_unlock(&fc->lock);
+
+	return ff;
+}
+
 static int fuse_writepage_locked(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
@@ -1501,7 +1515,6 @@ static int fuse_writepage_locked(struct page *page)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_req *req;
-	struct fuse_file *ff;
 	struct page *tmp_page;
 
 	set_page_writeback(page);
@@ -1515,13 +1528,8 @@ static int fuse_writepage_locked(struct page *page)
 	if (!tmp_page)
 		goto err_free;
 
-	spin_lock(&fc->lock);
-	BUG_ON(list_empty(&fi->write_files));
-	ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
-	req->ff = fuse_file_get(ff);
-	spin_unlock(&fc->lock);
-
-	fuse_write_fill(req, ff, page_offset(page), 0);
+	req->ff = fuse_write_file(fc, fi);
+	fuse_write_fill(req, req->ff, page_offset(page), 0);
 
 	copy_highpage(tmp_page, page);
 	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
-- 
cgit v1.3


From 385b126815d9278cf347504cf17ba62fb57e00ca Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Sat, 29 Jun 2013 21:42:48 +0400
Subject: fuse: Prepare to handle multiple pages in writeback

The .writepages callback will issue writeback requests with more than one
page aboard. Make existing end/check code be aware of this.

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index aab6d05355e4..944acc096752 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -334,7 +334,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
 
 		BUG_ON(req->inode != inode);
 		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
-		if (curr_index == index) {
+		if (curr_index <= index &&
+		    index < curr_index + req->num_pages) {
 			found = true;
 			break;
 		}
@@ -1409,7 +1410,10 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
-	__free_page(req->pages[0]);
+	int i;
+
+	for (i = 0; i < req->num_pages; i++)
+		__free_page(req->pages[i]);
 	fuse_file_put(req->ff, false);
 }
 
@@ -1418,11 +1422,14 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 	struct inode *inode = req->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+	int i;
 
 	list_del(&req->writepages_entry);
-	dec_bdi_stat(bdi, BDI_WRITEBACK);
-	dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
-	bdi_writeout_inc(bdi);
+	for (i = 0; i < req->num_pages; i++) {
+		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+		bdi_writeout_inc(bdi);
+	}
 	wake_up(&fi->page_waitq);
 }
 
@@ -1434,14 +1441,15 @@ __acquires(fc->lock)
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	loff_t size = i_size_read(req->inode);
 	struct fuse_write_in *inarg = &req->misc.write.in;
+	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
 
 	if (!fc->connected)
 		goto out_free;
 
-	if (inarg->offset + PAGE_CACHE_SIZE <= size) {
-		inarg->size = PAGE_CACHE_SIZE;
+	if (inarg->offset + data_size <= size) {
+		inarg->size = data_size;
 	} else if (inarg->offset < size) {
-		inarg->size = size & (PAGE_CACHE_SIZE - 1);
+		inarg->size = size - inarg->offset;
 	} else {
 		/* Got truncated off completely */
 		goto out_free;
-- 
cgit v1.3


From cca2437045dda994d23bd65891b71e091fa35b5a Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 1 Oct 2013 16:44:51 +0200
Subject: fuse: lock page in mkwrite

Lock the page in fuse_page_mkwrite() to protect against a race with
fuse_writepage() where the page is redirtied before the actual writeback
begins.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 944acc096752..6fb9425c23c6 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1618,14 +1618,17 @@ static void fuse_vma_close(struct vm_area_struct *vma)
 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	/*
-	 * Don't use page->mapping as it may become NULL from a
-	 * concurrent truncate.
-	 */
-	struct inode *inode = vma->vm_file->f_mapping->host;
+	struct inode *inode = file_inode(vma->vm_file);
+
+	file_update_time(vma->vm_file);
+	lock_page(page);
+	if (page->mapping != inode->i_mapping) {
+		unlock_page(page);
+		return VM_FAULT_NOPAGE;
+	}
 
 	fuse_wait_on_page_writeback(inode, page->index);
-	return 0;
+	return VM_FAULT_LOCKED;
 }
 
 static const struct vm_operations_struct fuse_file_vm_ops = {
-- 
cgit v1.3


From 72523425fb434e81c0c9f611bd880ce339c4e06b Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 1 Oct 2013 16:44:52 +0200
Subject: fuse: don't BUG on no write file

Don't bug if there's no writable files found for page writeback.  If ever
this is triggered, a WARN_ON helps debugging it much better then a BUG_ON.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6fb9425c23c6..6ce0066d7c8c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1505,12 +1505,14 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 static struct fuse_file *fuse_write_file(struct fuse_conn *fc,
 					 struct fuse_inode *fi)
 {
-	struct fuse_file *ff;
+	struct fuse_file *ff = NULL;
 
 	spin_lock(&fc->lock);
-	BUG_ON(list_empty(&fi->write_files));
-	ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
-	fuse_file_get(ff);
+	if (!WARN_ON(list_empty(&fi->write_files))) {
+		ff = list_entry(fi->write_files.next, struct fuse_file,
+				write_entry);
+		fuse_file_get(ff);
+	}
 	spin_unlock(&fc->lock);
 
 	return ff;
@@ -1524,6 +1526,7 @@ static int fuse_writepage_locked(struct page *page)
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_req *req;
 	struct page *tmp_page;
+	int error = -ENOMEM;
 
 	set_page_writeback(page);
 
@@ -1536,7 +1539,11 @@ static int fuse_writepage_locked(struct page *page)
 	if (!tmp_page)
 		goto err_free;
 
+	error = -EIO;
 	req->ff = fuse_write_file(fc, fi);
+	if (!req->ff)
+		goto err_free;
+
 	fuse_write_fill(req, req->ff, page_offset(page), 0);
 
 	copy_highpage(tmp_page, page);
@@ -1566,7 +1573,7 @@ err_free:
 	fuse_request_free(req);
 err:
 	end_page_writeback(page);
-	return -ENOMEM;
+	return error;
 }
 
 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
-- 
cgit v1.3


From 26d614df1da9d7d255686af5d6d4508f77853c01 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Sat, 29 Jun 2013 21:45:29 +0400
Subject: fuse: Implement writepages callback

The .writepages one is required to make each writeback request carry more than
one page on it. The patch enables optimized behaviour unconditionally,
i.e. mmap-ed writes will benefit from the patch even if fc->writeback_cache=0.

[SzM: simplify, add comments]

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 147 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6ce0066d7c8c..0bd349dd968f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1502,8 +1502,8 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_writepage_free(fc, req);
 }
 
-static struct fuse_file *fuse_write_file(struct fuse_conn *fc,
-					 struct fuse_inode *fi)
+static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
+					     struct fuse_inode *fi)
 {
 	struct fuse_file *ff = NULL;
 
@@ -1540,7 +1540,7 @@ static int fuse_writepage_locked(struct page *page)
 		goto err_free;
 
 	error = -EIO;
-	req->ff = fuse_write_file(fc, fi);
+	req->ff = fuse_write_file_get(fc, fi);
 	if (!req->ff)
 		goto err_free;
 
@@ -1586,6 +1586,149 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
 	return err;
 }
 
+struct fuse_fill_wb_data {
+	struct fuse_req *req;
+	struct fuse_file *ff;
+	struct inode *inode;
+};
+
+static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+{
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	req->ff = fuse_file_get(data->ff);
+	spin_lock(&fc->lock);
+	list_add_tail(&req->list, &fi->queued_writes);
+	fuse_flush_writepages(inode);
+	spin_unlock(&fc->lock);
+}
+
+static int fuse_writepages_fill(struct page *page,
+		struct writeback_control *wbc, void *_data)
+{
+	struct fuse_fill_wb_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct page *tmp_page;
+	int err;
+
+	if (!data->ff) {
+		err = -EIO;
+		data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+		if (!data->ff)
+			goto out_unlock;
+	}
+
+	if (req) {
+		BUG_ON(!req->num_pages);
+		if (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+		    (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+		    req->pages[req->num_pages - 1]->index + 1 != page->index) {
+
+			fuse_writepages_send(data);
+			data->req = NULL;
+		}
+	}
+	err = -ENOMEM;
+	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (!tmp_page)
+		goto out_unlock;
+
+	/*
+	 * The page must not be redirtied until the writeout is completed
+	 * (i.e. userspace has sent a reply to the write request).  Otherwise
+	 * there could be more than one temporary page instance for each real
+	 * page.
+	 *
+	 * This is ensured by holding the page lock in page_mkwrite() while
+	 * checking fuse_page_is_writeback().  We already hold the page lock
+	 * since clear_page_dirty_for_io() and keep it held until we add the
+	 * request to the fi->writepages list and increment req->num_pages.
+	 * After this fuse_page_is_writeback() will indicate that the page is
+	 * under writeback, so we can release the page lock.
+	 */
+	if (data->req == NULL) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		err = -ENOMEM;
+		req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+		if (!req) {
+			__free_page(tmp_page);
+			goto out_unlock;
+		}
+
+		fuse_write_fill(req, data->ff, page_offset(page), 0);
+		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+		req->in.argpages = 1;
+		req->background = 1;
+		req->num_pages = 0;
+		req->end = fuse_writepage_end;
+		req->inode = inode;
+
+		spin_lock(&fc->lock);
+		list_add(&req->writepages_entry, &fi->writepages);
+		spin_unlock(&fc->lock);
+
+		data->req = req;
+	}
+	set_page_writeback(page);
+
+	copy_highpage(tmp_page, page);
+	req->pages[req->num_pages] = tmp_page;
+	req->page_descs[req->num_pages].offset = 0;
+	req->page_descs[req->num_pages].length = PAGE_SIZE;
+
+	inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+	end_page_writeback(page);
+
+	/*
+	 * Protected by fc->lock against concurrent access by
+	 * fuse_page_is_writeback().
+	 */
+	spin_lock(&fc->lock);
+	req->num_pages++;
+	spin_unlock(&fc->lock);
+
+	err = 0;
+out_unlock:
+	unlock_page(page);
+
+	return err;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_fill_wb_data data;
+	int err;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	data.inode = inode;
+	data.req = NULL;
+	data.ff = NULL;
+
+	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+	if (data.req) {
+		/* Ignore errors if we can write at least one page */
+		BUG_ON(!data.req->num_pages);
+		fuse_writepages_send(&data);
+		err = 0;
+	}
+	if (data.ff)
+		fuse_file_put(data.ff, false);
+out:
+	return err;
+}
+
 static int fuse_launder_page(struct page *page)
 {
 	int err = 0;
@@ -2607,6 +2750,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 static const struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.writepage	= fuse_writepage,
+	.writepages	= fuse_writepages,
 	.launder_page	= fuse_launder_page,
 	.readpages	= fuse_readpages,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
-- 
cgit v1.3


From 2d033eaa0073d276ee6c324dd0ade0c5074a5542 Mon Sep 17 00:00:00 2001
From: Maxim Patlasov <MPatlasov@parallels.com>
Date: Fri, 16 Aug 2013 15:51:41 +0400
Subject: fuse: fix race in fuse_writepages()

The patch fixes a race between ftruncate(2), mmap-ed write and write(2):

1) An user makes a page dirty via mmap-ed write.
2) The user performs shrinking truncate(2) intended to purge the page.
3) Before fuse_do_setattr calls truncate_pagecache, the page goes to
   writeback. fuse_writepages_fill attaches a new page to FUSE_WRITE request,
   then releases the original page by end_page_writeback and unlock it.
4) fuse_do_setattr completes and successfully returns. Since now, i_mutex
   is free.
5) Ordinary write(2) extends i_size back to cover the page. Note that
   fuse_send_write_pages do wait for fuse writeback, but for another
   page->index.
6) fuse_writepages_fill attaches more pages to the request (if any), then
   fuse_writepages_send is eventually called. It is supposed to crop
   inarg->size of the request, but it doesn't because i_size has already been
   extended back.

Moving end_page_writeback behind fuse_writepages_send guarantees that
__fuse_release_nowrite (called from fuse_do_setattr) will crop inarg->size
of the request before write(2) gets the chance to extend i_size.

Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0bd349dd968f..cc3a6c4437e4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1590,6 +1590,7 @@ struct fuse_fill_wb_data {
 	struct fuse_req *req;
 	struct fuse_file *ff;
 	struct inode *inode;
+	struct page **orig_pages;
 };
 
 static void fuse_writepages_send(struct fuse_fill_wb_data *data)
@@ -1598,12 +1599,17 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
 	struct inode *inode = data->inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	int num_pages = req->num_pages;
+	int i;
 
 	req->ff = fuse_file_get(data->ff);
 	spin_lock(&fc->lock);
 	list_add_tail(&req->list, &fi->queued_writes);
 	fuse_flush_writepages(inode);
 	spin_unlock(&fc->lock);
+
+	for (i = 0; i < num_pages; i++)
+		end_page_writeback(data->orig_pages[i]);
 }
 
 static int fuse_writepages_fill(struct page *page,
@@ -1684,7 +1690,7 @@ static int fuse_writepages_fill(struct page *page,
 
 	inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
-	end_page_writeback(page);
+	data->orig_pages[req->num_pages] = page;
 
 	/*
 	 * Protected by fc->lock against concurrent access by
@@ -1716,6 +1722,13 @@ static int fuse_writepages(struct address_space *mapping,
 	data.req = NULL;
 	data.ff = NULL;
 
+	err = -ENOMEM;
+	data.orig_pages = kzalloc(sizeof(struct page *) *
+				  FUSE_MAX_PAGES_PER_REQ,
+				  GFP_NOFS);
+	if (!data.orig_pages)
+		goto out;
+
 	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
 	if (data.req) {
 		/* Ignore errors if we can write at least one page */
@@ -1725,6 +1738,8 @@ static int fuse_writepages(struct address_space *mapping,
 	}
 	if (data.ff)
 		fuse_file_put(data.ff, false);
+
+	kfree(data.orig_pages);
 out:
 	return err;
 }
-- 
cgit v1.3


From 1e112a484e58cd13e6c90a3dfd9e54ea5859b795 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 1 Oct 2013 16:44:53 +0200
Subject: fuse: writepages: fix aggregation

Checking against tmp-page indexes is not very useful, and results in one
(or rarely two) page requests.  Which is not much of an improvement...

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index cc3a6c4437e4..bf765cf7b112 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1633,7 +1633,7 @@ static int fuse_writepages_fill(struct page *page,
 		BUG_ON(!req->num_pages);
 		if (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
 		    (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
-		    req->pages[req->num_pages - 1]->index + 1 != page->index) {
+		    data->orig_pages[req->num_pages - 1]->index + 1 != page->index) {
 
 			fuse_writepages_send(data);
 			data->req = NULL;
-- 
cgit v1.3


From 8b284dc47291daf72fe300e1138a2e7ed56f38ab Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 1 Oct 2013 16:44:53 +0200
Subject: fuse: writepages: handle same page rewrites

As Maxim Patlasov pointed out, it's possible to get a dirty page while it's
copy is still under writeback, despite fuse_page_mkwrite() doing its thing
(direct IO).

This could result in two concurrent write request for the same offset, with
data corruption if they get mixed up.

To prevent this, fuse needs to check and delay such writes.  This
implementation does this by:

 1. check if page is still under writeout, if so create a new, single page
    secondary request for it

 2. chain this secondary request onto the in-flight request

 2/a. if a seconday request for the same offset was already chained to the
    in-flight request, then just copy the contents of the page and discard
    the new secondary request.  This makes sure that for each page will
    have at most two requests associated with it

 3. when the in-flight request finished, send off all secondary requests
    chained onto it

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c   | 100 +++++++++++++++++++++++++++++++++++++++++++++++++------
 fs/fuse/fuse_i.h |   1 +
 2 files changed, 91 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index bf765cf7b112..f8ff019cc6ec 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1414,7 +1414,9 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 
 	for (i = 0; i < req->num_pages; i++)
 		__free_page(req->pages[i]);
-	fuse_file_put(req->ff, false);
+
+	if (req->ff)
+		fuse_file_put(req->ff, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1496,6 +1498,14 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 
 	mapping_set_error(inode->i_mapping, req->out.h.error);
 	spin_lock(&fc->lock);
+	while (req->misc.write.next) {
+		struct fuse_req *next = req->misc.write.next;
+		req->misc.write.next = next->misc.write.next;
+		next->misc.write.next = NULL;
+		list_add(&next->writepages_entry, &fi->writepages);
+		list_add_tail(&next->list, &fi->queued_writes);
+		fuse_flush_writepages(inode);
+	}
 	fi->writectr--;
 	fuse_writepage_finish(fc, req);
 	spin_unlock(&fc->lock);
@@ -1548,6 +1558,7 @@ static int fuse_writepage_locked(struct page *page)
 
 	copy_highpage(tmp_page, page);
 	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+	req->misc.write.next = NULL;
 	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
@@ -1612,6 +1623,62 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
 		end_page_writeback(data->orig_pages[i]);
 }
 
+static bool fuse_writepage_in_flight(struct fuse_req *new_req,
+				     struct page *page)
+{
+	struct fuse_conn *fc = get_fuse_conn(new_req->inode);
+	struct fuse_inode *fi = get_fuse_inode(new_req->inode);
+	struct fuse_req *tmp;
+	struct fuse_req *old_req;
+	bool found = false;
+	pgoff_t curr_index;
+
+	BUG_ON(new_req->num_pages != 0);
+
+	spin_lock(&fc->lock);
+	list_del(&new_req->writepages_entry);
+	new_req->num_pages = 1;
+	list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
+		BUG_ON(old_req->inode != new_req->inode);
+		curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (curr_index <= page->index &&
+		    page->index < curr_index + old_req->num_pages) {
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+		goto out_unlock;
+
+	for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
+		BUG_ON(tmp->inode != new_req->inode);
+		curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (tmp->num_pages == 1 &&
+		    curr_index == page->index) {
+			old_req = tmp;
+		}
+	}
+
+	if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
+					old_req->state == FUSE_REQ_PENDING)) {
+		copy_highpage(old_req->pages[0], page);
+		spin_unlock(&fc->lock);
+
+		dec_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+		dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+		fuse_writepage_free(fc, new_req);
+		fuse_request_free(new_req);
+		goto out;
+	} else {
+		new_req->misc.write.next = old_req->misc.write.next;
+		old_req->misc.write.next = new_req;
+	}
+out_unlock:
+	spin_unlock(&fc->lock);
+out:
+	return found;
+}
+
 static int fuse_writepages_fill(struct page *page,
 		struct writeback_control *wbc, void *_data)
 {
@@ -1620,6 +1687,7 @@ static int fuse_writepages_fill(struct page *page,
 	struct inode *inode = data->inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct page *tmp_page;
+	bool is_writeback;
 	int err;
 
 	if (!data->ff) {
@@ -1629,15 +1697,20 @@ static int fuse_writepages_fill(struct page *page,
 			goto out_unlock;
 	}
 
-	if (req) {
-		BUG_ON(!req->num_pages);
-		if (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
-		    (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
-		    data->orig_pages[req->num_pages - 1]->index + 1 != page->index) {
+	/*
+	 * Being under writeback is unlikely but possible.  For example direct
+	 * read to an mmaped fuse file will set the page dirty twice; once when
+	 * the pages are faulted with get_user_pages(), and then after the read
+	 * completed.
+	 */
+	is_writeback = fuse_page_is_writeback(inode, page->index);
 
-			fuse_writepages_send(data);
-			data->req = NULL;
-		}
+	if (req && req->num_pages &&
+	    (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+	     data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
+		fuse_writepages_send(data);
+		data->req = NULL;
 	}
 	err = -ENOMEM;
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
@@ -1669,6 +1742,7 @@ static int fuse_writepages_fill(struct page *page,
 
 		fuse_write_fill(req, data->ff, page_offset(page), 0);
 		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+		req->misc.write.next = NULL;
 		req->in.argpages = 1;
 		req->background = 1;
 		req->num_pages = 0;
@@ -1690,6 +1764,13 @@ static int fuse_writepages_fill(struct page *page,
 
 	inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+	err = 0;
+	if (is_writeback && fuse_writepage_in_flight(req, page)) {
+		end_page_writeback(page);
+		data->req = NULL;
+		goto out_unlock;
+	}
 	data->orig_pages[req->num_pages] = page;
 
 	/*
@@ -1700,7 +1781,6 @@ static int fuse_writepages_fill(struct page *page,
 	req->num_pages++;
 	spin_unlock(&fc->lock);
 
-	err = 0;
 out_unlock:
 	unlock_page(page);
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5b9e6f3b6aef..643274852c8b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -321,6 +321,7 @@ struct fuse_req {
 		struct {
 			struct fuse_write_in in;
 			struct fuse_write_out out;
+			struct fuse_req *next;
 		} write;
 		struct fuse_notify_retrieve_in retrieve_in;
 		struct fuse_lk_in lk_in;
-- 
cgit v1.3


From ff17be0864777fe0dfb0a477868a8cb95c1ff90e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 1 Oct 2013 16:44:53 +0200
Subject: fuse: writepage: skip already in flight

If ->writepage() tries to write back a page whose copy is still in flight,
then just skip by calling redirty_page_for_writepage().

This is OK, since now ->writepage() should never be called for data
integrity sync.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f8ff019cc6ec..135360e2b9e5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1591,6 +1591,18 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int err;
 
+	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+		/*
+		 * ->writepages() should be called for sync() and friends.  We
+		 * should only get here on direct reclaim and then we are
+		 * allowed to skip a page which is already in flight
+		 */
+		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
+		redirty_page_for_writepage(wbc, page);
+		return 0;
+	}
+
 	err = fuse_writepage_locked(page);
 	unlock_page(page);
 
-- 
cgit v1.3


From cb2ffb26e67ef89c44f46e971440cda2f83ae236 Mon Sep 17 00:00:00 2001
From: Tom Gundersen <teg@jklm.no>
Date: Mon, 9 Sep 2013 20:18:27 +0200
Subject: cuse: add fix minor number to /dev/cuse

This allows udev (or more recently systemd-tmpfiles) to create /dev/cuse on
boot, in the same way as /dev/fuse is currently created, and the corresponding
module to be loaded on first access.

The corresponding functionalty was introduced for fuse in commit 578454f.

Signed-off-by: Tom Gundersen <teg@jklm.no>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 Documentation/devices.txt  | 1 +
 fs/fuse/cuse.c             | 5 ++++-
 include/linux/miscdevice.h | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 23721d3be3e6..80b72419ffd8 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -414,6 +414,7 @@ Your cooperation is appreciated.
 		200 = /dev/net/tun	TAP/TUN network device
 		201 = /dev/button/gulpb	Transmeta GULP-B buttons
 		202 = /dev/emd/ctl	Enhanced Metadisk RAID (EMD) control
+		203 = /dev/cuse		Cuse (character device in user-space)
 		204 = /dev/video/em8300		EM8300 DVD decoder control
 		205 = /dev/video/em8300_mv	EM8300 DVD decoder video
 		206 = /dev/video/em8300_ma	EM8300 DVD decoder audio
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index adbfd66b380f..24da581cb52b 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -589,11 +589,14 @@ static struct attribute *cuse_class_dev_attrs[] = {
 ATTRIBUTE_GROUPS(cuse_class_dev);
 
 static struct miscdevice cuse_miscdev = {
-	.minor		= MISC_DYNAMIC_MINOR,
+	.minor		= CUSE_MINOR,
 	.name		= "cuse",
 	.fops		= &cuse_channel_fops,
 };
 
+MODULE_ALIAS_MISCDEV(CUSE_MINOR);
+MODULE_ALIAS("devname:cuse");
+
 static int __init cuse_init(void)
 {
 	int i, rc;
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 09c2300ddb37..c8cf093d8b71 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -31,6 +31,7 @@
 #define I2O_MINOR		166
 #define MICROCODE_MINOR		184
 #define TUN_MINOR		200
+#define CUSE_MINOR		203
 #define MWAVE_MINOR		219	/* ACP/Mwave Modem */
 #define MPT_MINOR		220
 #define MPT2SAS_MINOR		221
-- 
cgit v1.3


From aaaae98022efa4f3c31042f1fdf9e7a0c5f04663 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@gmail.com>
Date: Tue, 1 Oct 2013 16:47:53 +0200
Subject: xfs: Use kmem_free() instead of free()

This fixes a build failure caused by calling the free() function which
does not exist in the Linux kernel.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 43240583fd5b..39797490a1f1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1585,7 +1585,7 @@ xlog_recover_add_to_trans(
 		"bad number of regions (%d) in inode log format",
 				  in_f->ilf_size);
 			ASSERT(0);
-			free(ptr);
+			kmem_free(ptr);
 			return XFS_ERROR(EIO);
 		}
 
-- 
cgit v1.3


From 0799a3e808e5543235bcdfe9d1fa1572e0e86ab5 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Sun, 29 Sep 2013 18:56:04 +0800
Subject: xfs: get rid of count from xfs_iomap_write_allocate()

Get rid of function variable count from xfs_iomap_write_allocate() as
it is unused.

Additionally, checkpatch warn me of the following for this change:
WARNING: extern prototypes should be avoided in .h files
+extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,

So this patch also remove all extern function prototypes at xfs_iomap.h
to suppress it to make this code style in consistent manner in this file.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c  | 2 +-
 fs/xfs/xfs_iomap.c | 1 -
 fs/xfs/xfs_iomap.h | 8 ++++----
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e51e581454e9..f3f95f094af8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -333,7 +333,7 @@ xfs_map_blocks(
 
 	if (type == XFS_IO_DELALLOC &&
 	    (!nimaps || isnullstartblock(imap->br_startblock))) {
-		error = xfs_iomap_write_allocate(ip, offset, count, imap);
+		error = xfs_iomap_write_allocate(ip, offset, imap);
 		if (!error)
 			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 		return -XFS_ERROR(error);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8d4d49b6fbf3..521fdf2ac6da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -655,7 +655,6 @@ int
 xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
-	size_t		count,
 	xfs_bmbt_irec_t *imap)
 {
 	xfs_mount_t	*mp = ip->i_mount;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 80615760959a..411fbb8919ef 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -21,12 +21,12 @@
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
-extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
-extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *);
-extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
 			struct xfs_bmbt_irec *);
-extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 
 #endif /* __XFS_IOMAP_H__*/
-- 
cgit v1.3


From a6f951ddbdfb7bd87d31a44f61abe202ed6ce57f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 1 Oct 2013 14:24:58 -0400
Subject: NFSv4: Fix a use-after-free situation in _nfs4_proc_getlk()

In nfs4_proc_getlk(), when some error causes a retry of the call to
_nfs4_proc_getlk(), we can end up with Oopses of the form

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000134
 IP: [<ffffffff8165270e>] _raw_spin_lock+0xe/0x30
<snip>
 Call Trace:
  [<ffffffff812f287d>] _atomic_dec_and_lock+0x4d/0x70
  [<ffffffffa053c4f2>] nfs4_put_lock_state+0x32/0xb0 [nfsv4]
  [<ffffffffa053c585>] nfs4_fl_release_lock+0x15/0x20 [nfsv4]
  [<ffffffffa0522c06>] _nfs4_proc_getlk.isra.40+0x146/0x170 [nfsv4]
  [<ffffffffa052ad99>] nfs4_proc_lock+0x399/0x5a0 [nfsv4]

The problem is that we don't clear the request->fl_ops after the first
try and so when we retry, nfs4_set_lock_state() exits early without
setting the lock stateid.
Regression introduced by commit 70cc6487a4e08b8698c0e2ec935fb48d10490162
(locks: make ->lock release private data before returning in GETLK case)

Reported-by: Weston Andros Adamson <dros@netapp.com>
Reported-by: Jorge Mora <mora@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: <stable@vger.kernel.org> #2.6.22+
---
 fs/nfs/nfs4proc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d53d6785cba2..d2b4845ecb4d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5106,6 +5106,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 			status = 0;
 	}
 	request->fl_ops->fl_release_private(request);
+	request->fl_ops = NULL;
 out:
 	return status;
 }
-- 
cgit v1.3


From 99875249bfbfb6d9a2aba020ce65da2862d0dafa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 24 Sep 2013 12:06:07 -0400
Subject: NFSv4: Ensure that we disable the resend timeout for NFSv4

The spec states that the client should not resend requests because
the server will disconnect if it needs to drop an RPC request.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 2 ++
 fs/nfs/nfs4client.c       | 1 +
 include/linux/nfs_fs_sb.h | 1 +
 3 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2dceee4db076..af0325864df6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 
 	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+	if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
 	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index a860ab566d6e..511cdce6ecf2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -368,6 +368,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 	if (clp->cl_minorversion != 0)
 		__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
 	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+	__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
 	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
 	if (error == -EINVAL)
 		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b8cedced50c9..f9c0a6cb41e9 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -41,6 +41,7 @@ struct nfs_client {
 #define NFS_CS_DISCRTRY		1		/* - disconnect on RPC retry */
 #define NFS_CS_MIGRATION	2		/* - transparent state migr */
 #define NFS_CS_INFINITE_SLOTS	3		/* - don't limit TCP slots */
+#define NFS_CS_NO_RETRANS_TIMEOUT	4	/* - Disable retransmit timeouts */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
-- 
cgit v1.3


From 17ec81c15fd022842f9bc947841ba9fb9eb52591 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Sun, 22 Sep 2013 16:25:15 +0800
Subject: xfs: fix the wrong new_size/rnew_size at xfs_iext_realloc_direct()

At xfs_iext_realloc_direct(), the new_size is changed by adding
if_bytes if originally the extent records are stored at the inline
extent buffer, and we have to switch from it to a direct extent
list for those new allocated extents, this is wrong. e.g,

Create a file with three extents which was showing as following,

xfs_io -f -c "truncate 100m" /xfs/testme

for i in $(seq 0 5 10); do
	offset=$(($i * $((1 << 20))))
	xfs_io -c "pwrite $offset 1m" /xfs/testme
done

Inline
------
irec:	if_bytes	bytes_diff	new_size
1st	0		16		16
2nd	16		16		32

Switching
---------						rnew_size
3rd	32		16		48 + 32 = 80	roundup=128

In this case, the desired value of new_size should be 48, and then
it will be roundup to 64 and be assigned to rnew_size.

However, this issue has been covered by resetting the if_bytes to
the new_size which is calculated at the begnning of xfs_iext_add()
before leaving out this function, and in turn make the rnew_size
correctly again. Hence, this can not be detected via xfstestes.

This patch fix above problem and revise the new_size comments at
xfs_iext_realloc_direct() to make it more readable.  Also, fix the
comments while switching from the inline extent buffer to a direct
extent list to reflect this change.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode_fork.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index 02f1083955bb..4fa56fcb38d1 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -1359,7 +1359,7 @@ xfs_iext_remove_indirect(
 void
 xfs_iext_realloc_direct(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	int		new_size)	/* new size of extents */
+	int		new_size)	/* new size of extents after adding */
 {
 	int		rnew_size;	/* real new size of extents */
 
@@ -1397,13 +1397,8 @@ xfs_iext_realloc_direct(
 				rnew_size - ifp->if_real_bytes);
 		}
 	}
-	/*
-	 * Switch from the inline extent buffer to a direct
-	 * extent list. Be sure to include the inline extent
-	 * bytes in new_size.
-	 */
+	/* Switch from the inline extent buffer to a direct extent list */
 	else {
-		new_size += ifp->if_bytes;
 		if (!is_power_of_2(new_size)) {
 			rnew_size = roundup_pow_of_two(new_size);
 		}
-- 
cgit v1.3


From d948709b8e468510d8c855d1e4c9f1cb7b524880 Mon Sep 17 00:00:00 2001
From: Ben Myers <bpm@sgi.com>
Date: Tue, 10 Sep 2013 18:11:22 -0500
Subject: xfs: remove usage of is_bad_inode

XFS never calls mark_inode_bad or iget_failed, so it will never see a
bad inode.  Remove all checks for is_bad_inode because they are
unnecessary.

Signed-off-by: Ben Myers <bpm@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_icache.c | 7 -------
 fs/xfs/xfs_inode.c  | 2 +-
 fs/xfs/xfs_super.c  | 9 ---------
 3 files changed, 1 insertion(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 474807a401c8..76a4ecd638c3 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -500,11 +500,6 @@ xfs_inode_ag_walk_grab(
 	if (!igrab(inode))
 		return ENOENT;
 
-	if (is_bad_inode(inode)) {
-		IRELE(ip);
-		return ENOENT;
-	}
-
 	/* inode is valid */
 	return 0;
 
@@ -918,8 +913,6 @@ restart:
 		xfs_iflock(ip);
 	}
 
-	if (is_bad_inode(VFS_I(ip)))
-		goto reclaim;
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		xfs_iunpin_wait(ip);
 		xfs_iflush_abort(ip, false);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e3d75385aa76..5ac716b2df22 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1687,7 +1687,7 @@ xfs_inactive(
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
 	 */
-	if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
+	if (ip->i_d.di_mode == 0) {
 		ASSERT(ip->i_df.if_real_bytes == 0);
 		ASSERT(ip->i_df.if_broot_bytes == 0);
 		return VN_INACTIVE_CACHE;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 15188cc99449..938e50fe6c61 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -946,10 +946,6 @@ xfs_fs_destroy_inode(
 
 	XFS_STATS_INC(vn_reclaim);
 
-	/* bad inode, get out here ASAP */
-	if (is_bad_inode(inode))
-		goto out_reclaim;
-
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
 
 	/*
@@ -965,7 +961,6 @@ xfs_fs_destroy_inode(
 	 * this more efficiently than we can here, so simply let background
 	 * reclaim tear down all inodes.
 	 */
-out_reclaim:
 	xfs_inode_set_reclaim_tag(ip);
 }
 
@@ -1491,10 +1486,6 @@ xfs_fs_fill_super(
 		error = ENOENT;
 		goto out_unmount;
 	}
-	if (is_bad_inode(root)) {
-		error = EINVAL;
-		goto out_unmount;
-	}
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		error = ENOMEM;
-- 
cgit v1.3


From 7b9cff467144c8c62268db1b0948df089caa0999 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 2 Oct 2013 11:13:25 +0100
Subject: GFS2: Add allocation parameters structure

This patch adds a structure to contain allocation parameters with
the intention of future expansion of this structure. The idea is
that we should be able to add more information about the allocation
in the future in order to allow the allocator to make a better job
of placing the requests on-disk.

There is no functional difference from applying this patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c   |  4 +++-
 fs/gfs2/bmap.c   |  3 ++-
 fs/gfs2/file.c   |  8 ++++++--
 fs/gfs2/incore.h | 14 ++++++++++++++
 fs/gfs2/inode.c  | 12 ++++++++----
 fs/gfs2/quota.c  |  8 ++++++--
 fs/gfs2/rgrp.c   | 18 +++++++++---------
 fs/gfs2/rgrp.h   |  2 +-
 fs/gfs2/xattr.c  |  3 ++-
 9 files changed, 51 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1f7d8057ea68..b7fc035a6943 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -611,12 +611,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
 
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .aflags = 0, };
 		error = gfs2_quota_lock_check(ip);
 		if (error)
 			goto out_unlock;
 
 		requested = data_blocks + ind_blocks;
-		error = gfs2_inplace_reserve(ip, requested, 0);
+		ap.target = requested;
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error)
 			goto out_qunlock;
 	}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 21ad0f11cad4..fe0500c0af7a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1216,6 +1216,7 @@ static int do_grow(struct inode *inode, u64 size)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_alloc_parms ap = { .target = 1, };
 	struct buffer_head *dibh;
 	int error;
 	int unstuff = 0;
@@ -1226,7 +1227,7 @@ static int do_grow(struct inode *inode, u64 size)
 		if (error)
 			return error;
 
-		error = gfs2_inplace_reserve(ip, 1, 0);
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error)
 			goto do_grow_qunlock;
 		unstuff = 1;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 9ad20edc9c27..efc078f0ee4e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = file_inode(vma->vm_file);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned long last_index;
 	u64 pos = page->index << PAGE_CACHE_SHIFT;
 	unsigned int data_blocks, ind_blocks, rblocks;
@@ -430,7 +431,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto out_unlock;
 	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
-	ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
+	ap.target = data_blocks + ind_blocks;
+	ret = gfs2_inplace_reserve(ip, &ap);
 	if (ret)
 		goto out_quota_unlock;
 
@@ -800,6 +802,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	struct inode *inode = file_inode(file);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	loff_t bytes, max_bytes;
 	int error;
@@ -850,7 +853,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 retry:
 		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
 
-		error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
+		ap.target = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error) {
 			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
 				bytes >>= 1;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8c8f110d8e35..082c8fa7fab9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -285,6 +285,20 @@ struct gfs2_blkreserv {
 	unsigned int rs_qa_qd_num;
 };
 
+/*
+ * Allocation parameters
+ * @target: The number of blocks we'd ideally like to allocate
+ * @aflags: The flags (e.g. Orlov flag)
+ *
+ * The intent is to gradually expand this structure over time in
+ * order to give more information, e.g. alignment, min extent size
+ * to the allocation code.
+ */
+struct gfs2_alloc_parms {
+	u32 target;
+	u32 aflags;
+};
+
 enum {
 	GLF_LOCK			= 1,
 	GLF_DEMOTE			= 3,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 4b79c19100d2..5a7ca3d1d1cf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -379,6 +379,7 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
 static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, };
 	int error;
 	int dblocks = 1;
 
@@ -386,7 +387,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
 	if (error)
 		goto out;
 
-	error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
+	error = gfs2_inplace_reserve(ip, &ap);
 	if (error)
 		goto out_quota;
 
@@ -472,6 +473,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 		       struct gfs2_inode *ip, int arq)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
 	int error;
 
 	if (arq) {
@@ -479,7 +481,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 		if (error)
 			goto fail_quota_locks;
 
-		error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
+		error = gfs2_inplace_reserve(dip, &ap);
 		if (error)
 			goto fail_quota_locks;
 
@@ -874,11 +876,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	error = 0;
 
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
 		error = gfs2_quota_lock_check(dip);
 		if (error)
 			goto out_gunlock;
 
-		error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
+		error = gfs2_inplace_reserve(dip, &ap);
 		if (error)
 			goto out_gunlock_q;
 
@@ -1387,11 +1390,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		goto out_gunlock;
 
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
 		error = gfs2_quota_lock_check(ndip);
 		if (error)
 			goto out_gunlock;
 
-		error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0);
+		error = gfs2_inplace_reserve(ndip, &ap);
 		if (error)
 			goto out_gunlock_q;
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index db441359ee8c..8fe7a0a87c80 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -763,6 +763,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 {
 	struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned int data_blocks, ind_blocks;
 	struct gfs2_holder *ghs, i_gh;
 	unsigned int qx, x;
@@ -815,7 +816,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
 
 	reserved = 1 + (nalloc * (data_blocks + ind_blocks));
-	error = gfs2_inplace_reserve(ip, reserved, 0);
+	ap.target = reserved;
+	error = gfs2_inplace_reserve(ip, &ap);
 	if (error)
 		goto out_alloc;
 
@@ -1573,10 +1575,12 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
 	if (gfs2_is_stuffed(ip))
 		alloc_required = 1;
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .aflags = 0, };
 		gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
 				       &data_blocks, &ind_blocks);
 		blocks = 1 + data_blocks + ind_blocks;
-		error = gfs2_inplace_reserve(ip, blocks, 0);
+		ap.target = blocks;
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error)
 			goto out_i;
 		blocks += gfs2_rg_blocks(ip, blocks);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index d4d10fadab79..4f0984a607b3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1422,12 +1422,12 @@ static void rs_insert(struct gfs2_inode *ip)
  * rg_mblk_search - find a group of multiple free blocks to form a reservation
  * @rgd: the resource group descriptor
  * @ip: pointer to the inode for which we're reserving blocks
- * @requested: number of blocks required for this allocation
+ * @ap: the allocation parameters
  *
  */
 
 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
-			   unsigned requested)
+			   const struct gfs2_alloc_parms *ap)
 {
 	struct gfs2_rbm rbm = { .rgd = rgd, };
 	u64 goal;
@@ -1440,7 +1440,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 	if (S_ISDIR(inode->i_mode))
 		extlen = 1;
 	else {
-		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
 		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
 	}
 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
@@ -1831,12 +1831,12 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
 /**
  * gfs2_inplace_reserve - Reserve space in the filesystem
  * @ip: the inode to reserve space for
- * @requested: the number of blocks to be reserved
+ * @ap: the allocation parameters
  *
  * Returns: errno
  */
 
-int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
+int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *begin = NULL;
@@ -1848,7 +1848,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
 
 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
-	if (gfs2_assert_warn(sdp, requested))
+	if (gfs2_assert_warn(sdp, ap->target))
 		return -EINVAL;
 	if (gfs2_rs_active(rs)) {
 		begin = rs->rs_rbm.rgd;
@@ -1857,7 +1857,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
 	} else {
 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
 	}
-	if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV))
+	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
 		skip = gfs2_orlov_skip(ip);
 	if (rs->rs_rbm.rgd == NULL)
 		return -EBADSLT;
@@ -1899,14 +1899,14 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
 
 		/* Get a reservation if we don't already have one */
 		if (!gfs2_rs_active(rs))
-			rg_mblk_search(rs->rs_rbm.rgd, ip, requested);
+			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
 
 		/* Skip rgrps when we can't get a reservation on first pass */
 		if (!gfs2_rs_active(rs) && (loops < 1))
 			goto check_rgrp;
 
 		/* If rgrp has enough free space, use it */
-		if (rs->rs_rbm.rgd->rd_free_clone >= requested) {
+		if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) {
 			ip->i_rgd = rs->rs_rbm.rgd;
 			return 0;
 		}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 57ea16ba3414..3a10d2ffbbe7 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -40,7 +40,7 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 
 #define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags);
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
 extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index ecd37f30ab91..8c6a6f6bdba9 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -723,6 +723,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 			     unsigned int blks,
 			     ea_skeleton_call_t skeleton_call, void *private)
 {
+	struct gfs2_alloc_parms ap = { .target = blks };
 	struct buffer_head *dibh;
 	int error;
 
@@ -734,7 +735,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (error)
 		return error;
 
-	error = gfs2_inplace_reserve(ip, blks, 0);
+	error = gfs2_inplace_reserve(ip, &ap);
 	if (error)
 		goto out_gunlock_q;
 
-- 
cgit v1.3


From 9e07f2cb3d7a93f4b1b18fc6e4dd6911dcba2442 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 2 Oct 2013 14:42:45 +0100
Subject: GFS2: Speed up starting point selection for block allocation

When setting the starting point for block allocation, there were calls
to both gfs2_rbm_to_block() and gfs2_rbm_from_block() in the common case
of there being an active reservation. The gfs2_rbm_from_block() function
can be quite slow, and since the two conversions were effectively a
no-op, it makes sense to avoid them entirely in this case.

There is no functional change here, but the code should be a bit more
efficient after this patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4f0984a607b3..4d83abdd5635 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2133,6 +2133,35 @@ out:
 	spin_unlock(&rgd->rd_rsspin);
 }
 
+/**
+ * gfs2_set_alloc_start - Set starting point for block allocation
+ * @rbm: The rbm which will be set to the required location
+ * @ip: The gfs2 inode
+ * @dinode: Flag to say if allocation includes a new inode
+ *
+ * This sets the starting point from the reservation if one is active
+ * otherwise it falls back to guessing a start point based on the
+ * inode's goal block or the last allocation point in the rgrp.
+ */
+
+static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
+				 const struct gfs2_inode *ip, bool dinode)
+{
+	u64 goal;
+
+	if (gfs2_rs_active(ip->i_res)) {
+		*rbm = ip->i_res->rs_rbm;
+		return;
+	}
+
+	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
+		goal = ip->i_goal;
+	else
+		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
+
+	gfs2_rbm_from_block(rbm, goal);
+}
+
 /**
  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
  * @ip: the inode to allocate the block for
@@ -2151,22 +2180,14 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	struct buffer_head *dibh;
 	struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
 	unsigned int ndata;
-	u64 goal;
 	u64 block; /* block, within the file system scope */
 	int error;
 
-	if (gfs2_rs_active(ip->i_res))
-		goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm);
-	else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal))
-		goal = ip->i_goal;
-	else
-		goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0;
-
-	gfs2_rbm_from_block(&rbm, goal);
+	gfs2_set_alloc_start(&rbm, ip, dinode);
 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false);
 
 	if (error == -ENOSPC) {
-		gfs2_rbm_from_block(&rbm, goal);
+		gfs2_set_alloc_start(&rbm, ip, dinode);
 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false);
 	}
 
-- 
cgit v1.3


From 26e43a15d44a02e380904ac6113fe0f2f1ddea8a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 2 Oct 2013 14:47:02 +0100
Subject: GFS2: Move gfs2_icbit_munge into quota.c

This function is only called twice, and both callers are
quota related, so lets move this function into quota.c and
make it static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 20 ++++++++++++++++++++
 fs/gfs2/util.c  | 20 --------------------
 fs/gfs2/util.h  |  2 --
 3 files changed, 20 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8fe7a0a87c80..fd1ec5243d2e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -289,6 +289,26 @@ static void slot_hold(struct gfs2_quota_data *qd)
 	spin_unlock(&qd_lru_lock);
 }
 
+static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+			     unsigned int bit, int new_value)
+{
+	unsigned int c, o, b = bit;
+	int old_value;
+
+	c = b / (8 * PAGE_SIZE);
+	b %= 8 * PAGE_SIZE;
+	o = b / 8;
+	b %= 8;
+
+	old_value = (bitmap[c][o] & (1 << b));
+	gfs2_assert_withdraw(sdp, !old_value != !new_value);
+
+	if (new_value)
+		bitmap[c][o] |= 1 << b;
+	else
+		bitmap[c][o] &= ~(1 << b);
+}
+
 static void slot_put(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 6402fb69d71b..f7109f689e61 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -268,23 +268,3 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	return rv;
 }
 
-void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
-		      unsigned int bit, int new_value)
-{
-	unsigned int c, o, b = bit;
-	int old_value;
-
-	c = b / (8 * PAGE_SIZE);
-	b %= 8 * PAGE_SIZE;
-	o = b / 8;
-	b %= 8;
-
-	old_value = (bitmap[c][o] & (1 << b));
-	gfs2_assert_withdraw(sdp, !old_value != !new_value);
-
-	if (new_value)
-		bitmap[c][o] |= 1 << b;
-	else
-		bitmap[c][o] &= ~(1 << b);
-}
-
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 80535739ac7b..b7ffb09b99ea 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -164,8 +164,6 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
 #define gfs2_tune_get(sdp, field) \
 gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
 
-void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
-		      unsigned int bit, int new_value);
 int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...);
 
 #endif /* __UTIL_DOT_H__ */
-- 
cgit v1.3


From 97e47fa11deae10ae72aa1109e700565c57152f5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 10:57:01 -0400
Subject: nfsd: switch to %p[dD]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4recover.c | 12 ++++++------
 fs/nfsd/nfs4state.c   | 15 ++++++---------
 fs/nfsd/nfsfh.c       | 28 +++++++++++++---------------
 fs/nfsd/nfsfh.h       |  4 ++--
 fs/nfsd/vfs.c         |  9 ++++-----
 5 files changed, 31 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e0a65a9e37e9..9c271f42604a 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -385,8 +385,8 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 
 	status = vfs_rmdir(parent->d_inode, child);
 	if (status)
-		printk("failed to remove client recovery directory %s\n",
-				child->d_name.name);
+		printk("failed to remove client recovery directory %pd\n",
+				child);
 	/* Keep trying, success or failure: */
 	return 0;
 }
@@ -410,15 +410,15 @@ out:
 	nfs4_release_reclaim(nn);
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
+			" directory %pD\n", nn->rec_file);
 }
 
 static int
 load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
 	if (child->d_name.len != HEXDIR_LEN - 1) {
-		printk("nfsd4: illegal name %s in recovery directory\n",
-				child->d_name.name);
+		printk("nfsd4: illegal name %pd in recovery directory\n",
+				child);
 		/* Keep trying; maybe the others are OK: */
 		return 0;
 	}
@@ -437,7 +437,7 @@ nfsd4_recdir_load(struct net *net) {
 	status = nfsd4_list_rec_dir(load_recdir, nn);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
+			" directory %pD\n", nn->rec_file);
 	return status;
 }
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0874998a49cd..a601fd49f997 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3843,9 +3843,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-	dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
-			(int)cstate->current_fh.fh_dentry->d_name.len,
-			cstate->current_fh.fh_dentry->d_name.name);
+	dprintk("NFSD: nfsd4_open_confirm on file %pd\n",
+			cstate->current_fh.fh_dentry);
 
 	status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0);
 	if (status)
@@ -3922,9 +3921,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	struct nfs4_ol_stateid *stp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-	dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 
-			(int)cstate->current_fh.fh_dentry->d_name.len,
-			cstate->current_fh.fh_dentry->d_name.name);
+	dprintk("NFSD: nfsd4_open_downgrade on file %pd\n", 
+			cstate->current_fh.fh_dentry);
 
 	/* We don't yet support WANT bits: */
 	if (od->od_deleg_want)
@@ -3980,9 +3978,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	dprintk("NFSD: nfsd4_close on file %.*s\n", 
-			(int)cstate->current_fh.fh_dentry->d_name.len,
-			cstate->current_fh.fh_dentry->d_name.name);
+	dprintk("NFSD: nfsd4_close on file %pd\n", 
+			cstate->current_fh.fh_dentry);
 
 	nfs4_lock_state();
 	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 814afaa4458a..3d0e15ae6f72 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -47,7 +47,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
 		tdentry = parent;
 	}
 	if (tdentry != exp->ex_path.dentry)
-		dprintk("nfsd_acceptable failed at %p %s\n", tdentry, tdentry->d_name.name);
+		dprintk("nfsd_acceptable failed at %p %pd\n", tdentry, tdentry);
 	rv = (tdentry == exp->ex_path.dentry);
 	dput(tdentry);
 	return rv;
@@ -253,8 +253,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 			(dentry->d_flags & DCACHE_DISCONNECTED)) {
-		printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
-				dentry->d_parent->d_name.name, dentry->d_name.name);
+		printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n",
+				dentry);
 	}
 
 	fhp->fh_dentry = dentry;
@@ -361,10 +361,9 @@ skip_pseudoflavor_check:
 	error = nfsd_permission(rqstp, exp, dentry, access);
 
 	if (error) {
-		dprintk("fh_verify: %s/%s permission failure, "
+		dprintk("fh_verify: %pd2 permission failure, "
 			"acc=%x, error=%d\n",
-			dentry->d_parent->d_name.name,
-			dentry->d_name.name,
+			dentry,
 			access, ntohl(error));
 	}
 out:
@@ -514,14 +513,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 	 */
 
 	struct inode * inode = dentry->d_inode;
-	struct dentry *parent = dentry->d_parent;
 	__u32 *datap;
 	dev_t ex_dev = exp_sb(exp)->s_dev;
 
-	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
+	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
 		MAJOR(ex_dev), MINOR(ex_dev),
 		(long) exp->ex_path.dentry->d_inode->i_ino,
-		parent->d_name.name, dentry->d_name.name,
+		dentry,
 		(inode ? inode->i_ino : 0));
 
 	/* Choose filehandle version and fsid type based on
@@ -534,13 +532,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 		fh_put(ref_fh);
 
 	if (fhp->fh_locked || fhp->fh_dentry) {
-		printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n",
-		       parent->d_name.name, dentry->d_name.name);
+		printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n",
+		       dentry);
 	}
 	if (fhp->fh_maxsize < NFS_FHSIZE)
-		printk(KERN_ERR "fh_compose: called with maxsize %d! %s/%s\n",
+		printk(KERN_ERR "fh_compose: called with maxsize %d! %pd2\n",
 		       fhp->fh_maxsize,
-		       parent->d_name.name, dentry->d_name.name);
+		       dentry);
 
 	fhp->fh_dentry = dget(dentry); /* our internal copy */
 	fhp->fh_export = exp;
@@ -613,8 +611,8 @@ out_bad:
 	printk(KERN_ERR "fh_update: fh not verified!\n");
 	goto out;
 out_negative:
-	printk(KERN_ERR "fh_update: %s/%s still negative!\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	printk(KERN_ERR "fh_update: %pd2 still negative!\n",
+		dentry);
 	goto out;
 }
 
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index e5e6707ba687..4775bc4896c8 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -173,8 +173,8 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
 	BUG_ON(!dentry);
 
 	if (fhp->fh_locked) {
-		printk(KERN_WARNING "fh_lock: %s/%s already locked!\n",
-			dentry->d_parent->d_name.name, dentry->d_name.name);
+		printk(KERN_WARNING "fh_lock: %pd2 already locked!\n",
+			dentry);
 		return;
 	}
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c827acb0e943..13886f7f40d5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1317,9 +1317,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		if (!fhp->fh_locked) {
 			/* not actually possible */
 			printk(KERN_ERR
-				"nfsd_create: parent %s/%s not locked!\n",
-				dentry->d_parent->d_name.name,
-				dentry->d_name.name);
+				"nfsd_create: parent %pd2 not locked!\n",
+				dentry);
 			err = nfserr_io;
 			goto out;
 		}
@@ -1329,8 +1328,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 */
 	err = nfserr_exist;
 	if (dchild->d_inode) {
-		dprintk("nfsd_create: dentry %s/%s not negative!\n",
-			dentry->d_name.name, dchild->d_name.name);
+		dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
+			dentry, dchild);
 		goto out; 
 	}
 
-- 
cgit v1.3


From d69ac5a0bbcf1d9962883fb23e337caf5b38cec8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 18 Sep 2013 17:15:35 -0400
Subject: sysfs: remove sysfs_addrm_cxt->parent_sd

sysfs_addrm_start/finish() enclose sysfs_dirent additions and
deletions and sysfs_addrm_cxt is used to record information necessary
to finish the operations.  Currently, sysfs_addrm_start() takes
@parent_sd, records it in sysfs_addrm_cxt, and assumes that all
operations in the block are performed under that @parent_sd.

This assumption has been fine until now but we want to make some
operations behave recursively and, while having @parent_sd recorded in
sysfs_addrm_cxt doesn't necessarily prevents that, it becomes
confusing.

This patch removes sysfs_addrm_cxt->parent_sd and makes
sysfs_add_one() take an explicit @parent_sd parameter.  Note that
sysfs_remove_one() doesn't need the extra argument as its parent is
always known from the target @sd.

While at it, add __acquires/releases() notations to
sysfs_addrm_start/finish() respectively.

This patch doesn't make any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c     | 52 +++++++++++++++++++++++++++-------------------------
 fs/sysfs/file.c    |  4 ++--
 fs/sysfs/inode.c   |  2 +-
 fs/sysfs/symlink.c |  6 +++---
 fs/sysfs/sysfs.h   | 10 +++++-----
 5 files changed, 38 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d23e66dfba74..671868914b5b 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -406,22 +406,19 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 /**
  *	sysfs_addrm_start - prepare for sysfs_dirent add/remove
  *	@acxt: pointer to sysfs_addrm_cxt to be used
- *	@parent_sd: parent sysfs_dirent
  *
- *	This function is called when the caller is about to add or
- *	remove sysfs_dirent under @parent_sd.  This function acquires
- *	sysfs_mutex.  @acxt is used to keep and pass context to
- *	other addrm functions.
+ *	This function is called when the caller is about to add or remove
+ *	sysfs_dirent.  This function acquires sysfs_mutex.  @acxt is used
+ *	to keep and pass context to other addrm functions.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).  sysfs_mutex is locked on
  *	return.
  */
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
-		       struct sysfs_dirent *parent_sd)
+void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
+	__acquires(sysfs_mutex)
 {
 	memset(acxt, 0, sizeof(*acxt));
-	acxt->parent_sd = parent_sd;
 
 	mutex_lock(&sysfs_mutex);
 }
@@ -430,10 +427,11 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  *	__sysfs_add_one - add sysfs_dirent to parent without warning
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be added
+ *	@parent_sd: the parent sysfs_dirent to add @sd to
  *
- *	Get @acxt->parent_sd and set sd->s_parent to it and increment
- *	nlink of parent inode if @sd is a directory and link into the
- *	children list of the parent.
+ *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
+ *	the parent inode if @sd is a directory and link into the children
+ *	list of the parent.
  *
  *	This function should be called between calls to
  *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
@@ -446,20 +444,21 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  *	0 on success, -EEXIST if entry with the given name already
  *	exists.
  */
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		    struct sysfs_dirent *parent_sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 	int ret;
 
 	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
-	sd->s_parent = sysfs_get(acxt->parent_sd);
+	sd->s_parent = sysfs_get(parent_sd);
 
 	ret = sysfs_link_sibling(sd);
 	if (ret)
 		return ret;
 
 	/* Update timestamps on the parent */
-	ps_iattr = acxt->parent_sd->s_iattr;
+	ps_iattr = parent_sd->s_iattr;
 	if (ps_iattr) {
 		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
 		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
@@ -493,10 +492,11 @@ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
  *	sysfs_add_one - add sysfs_dirent to parent
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be added
+ *	@parent_sd: the parent sysfs_dirent to add @sd to
  *
- *	Get @acxt->parent_sd and set sd->s_parent to it and increment
- *	nlink of parent inode if @sd is a directory and link into the
- *	children list of the parent.
+ *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
+ *	the parent inode if @sd is a directory and link into the children
+ *	list of the parent.
  *
  *	This function should be called between calls to
  *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
@@ -509,17 +509,18 @@ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
  *	0 on success, -EEXIST if entry with the given name already
  *	exists.
  */
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		  struct sysfs_dirent *parent_sd)
 {
 	int ret;
 
-	ret = __sysfs_add_one(acxt, sd);
+	ret = __sysfs_add_one(acxt, sd, parent_sd);
 	if (ret == -EEXIST) {
 		char *path = kzalloc(PATH_MAX, GFP_KERNEL);
 		WARN(1, KERN_WARNING
 		     "sysfs: cannot create duplicate filename '%s'\n",
 		     (path == NULL) ? sd->s_name
-				    : (sysfs_pathname(acxt->parent_sd, path),
+				    : (sysfs_pathname(parent_sd, path),
 				       strlcat(path, "/", PATH_MAX),
 				       strlcat(path, sd->s_name, PATH_MAX),
 				       path));
@@ -553,7 +554,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 	sysfs_unlink_sibling(sd);
 
 	/* Update timestamps on the parent */
-	ps_iattr = acxt->parent_sd->s_iattr;
+	ps_iattr = sd->s_parent->s_iattr;
 	if (ps_iattr) {
 		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
 		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
@@ -576,6 +577,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
  *	sysfs_mutex is released.
  */
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
+	__releases(sysfs_mutex)
 {
 	/* release resources acquired by sysfs_addrm_start() */
 	mutex_unlock(&sysfs_mutex);
@@ -678,8 +680,8 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd->s_dir.kobj = kobj;
 
 	/* link in */
-	sysfs_addrm_start(&acxt, parent_sd);
-	rc = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_start(&acxt);
+	rc = sysfs_add_one(&acxt, sd, parent_sd);
 	sysfs_addrm_finish(&acxt);
 
 	if (rc == 0)
@@ -772,7 +774,7 @@ static void remove_dir(struct sysfs_dirent *sd)
 {
 	struct sysfs_addrm_cxt acxt;
 
-	sysfs_addrm_start(&acxt, sd->s_parent);
+	sysfs_addrm_start(&acxt);
 	sysfs_remove_one(&acxt, sd);
 	sysfs_addrm_finish(&acxt);
 }
@@ -792,7 +794,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 		return;
 
 	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
-	sysfs_addrm_start(&acxt, dir_sd);
+	sysfs_addrm_start(&acxt);
 	pos = rb_first(&dir_sd->s_dir.children);
 	while (pos) {
 		struct sysfs_dirent *sd = to_sysfs_dirent(pos);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4697019fafa3..1656a79ea6c0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -502,8 +502,8 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
 	sd->s_attr.attr = (void *)attr;
 	sysfs_dirent_init_lockdep(sd);
 
-	sysfs_addrm_start(&acxt, dir_sd);
-	rc = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_start(&acxt);
+	rc = sysfs_add_one(&acxt, sd, dir_sd);
 	sysfs_addrm_finish(&acxt);
 
 	if (rc)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 07193d720d92..364c8873fbda 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -326,7 +326,7 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
 		return -ENOENT;
 	}
 
-	sysfs_addrm_start(&acxt, dir_sd);
+	sysfs_addrm_start(&acxt);
 
 	sd = sysfs_find_dirent(dir_sd, name, ns);
 	if (sd)
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 88c8bc5e8911..22ea2f5796f5 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -53,11 +53,11 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 	sd->s_symlink.target_sd = target_sd;
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
-	sysfs_addrm_start(&acxt, parent_sd);
+	sysfs_addrm_start(&acxt);
 	if (warn)
-		error = sysfs_add_one(&acxt, sd);
+		error = sysfs_add_one(&acxt, sd, parent_sd);
 	else
-		error = __sysfs_add_one(&acxt, sd);
+		error = __sysfs_add_one(&acxt, sd, parent_sd);
 	sysfs_addrm_finish(&acxt);
 
 	if (error)
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ee44fde199d0..4d1154411cdb 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -120,7 +120,6 @@ do {								\
  * Context structure to be used while adding/removing nodes.
  */
 struct sysfs_addrm_cxt {
-	struct sysfs_dirent	*parent_sd;
 	struct sysfs_dirent	*removed;
 };
 
@@ -154,10 +153,11 @@ extern const struct inode_operations sysfs_dir_inode_operations;
 struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
 void sysfs_put_active(struct sysfs_dirent *sd);
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
-		       struct sysfs_dirent *parent_sd);
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
+void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		    struct sysfs_dirent *parent_sd);
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		  struct sysfs_dirent *parent_sd);
 void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
-- 
cgit v1.3


From 26ea12dec0c84133add937455be76d44fe253d85 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 18 Sep 2013 17:15:36 -0400
Subject: kobject: grab an extra reference on kobject->sd to allow duplicate
 deletes

sysfs currently has a rather weird behavior regarding removals.  A
directory removal would delete all files directly under it but
wouldn't recurse into subdirectories, which, while a bit inconsistent,
seems to make sense at the first glance as each directory is
supposedly associated with a kobject and each kobject can take care of
the directory deletion; however, this doesn't really hold as we have
groups which can be directories without a kobject associated with it
and require explicit deletions.

We're in the process of separating out sysfs from kboject / driver
core and want a consistent behavior.  A removal should delete either
only the specified node or everything under it.  I think it is helpful
to support recursive atomic removal and later patches will implement
it.

Such change means that a sysfs_dirent associated with kobject may be
deleted before the kobject itself is removed if one of its ancestor
gets removed before it.  As sysfs_remove_dir() puts the base ref, we
may end up with dangling pointer on descendants.  This can be solved
by holding an extra reference on the sd from kobject.

Acquire an extra reference on the associated sysfs_dirent on directory
creation and put it after removal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c |  7 ++++++-
 lib/kobject.c  | 12 ++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 671868914b5b..105a7e2d1660 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -549,7 +549,12 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
-	BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
+	/*
+	 * Removal can be called multiple times on the same node.  Only the
+	 * first invocation is effective and puts the base ref.
+	 */
+	if (sd->s_flags & SYSFS_FLAG_REMOVED)
+		return;
 
 	sysfs_unlink_sibling(sd);
 
diff --git a/lib/kobject.c b/lib/kobject.c
index 151089788c21..2fdf7fa9e9bd 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -76,6 +76,13 @@ static int create_dir(struct kobject *kobj)
 		if (error)
 			sysfs_remove_dir(kobj);
 	}
+
+	/*
+	 * @kobj->sd may be deleted by an ancestor going away.  Hold an
+	 * extra reference so that it stays until @kobj is gone.
+	 */
+	sysfs_get(kobj->sd);
+
 	return error;
 }
 
@@ -532,10 +539,15 @@ out:
  */
 void kobject_del(struct kobject *kobj)
 {
+	struct sysfs_dirent *sd;
+
 	if (!kobj)
 		return;
 
+	sd = kobj->sd;
 	sysfs_remove_dir(kobj);
+	sysfs_put(sd);
+
 	kobj->state_in_sysfs = 0;
 	kobj_kset_leave(kobj);
 	kobject_put(kobj->parent);
-- 
cgit v1.3


From bcdde7e221a8750f9b62b6d0bd31b72ea4ad9309 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 18 Sep 2013 17:15:37 -0400
Subject: sysfs: make __sysfs_remove_dir() recursive

Currently, sysfs directory removal is inconsistent in that it would
remove any files directly under it but wouldn't recurse into
directories.  Thanks to group subdirectories, this doesn't even match
with kobject boundaries.  sysfs is in the process of being separated
out so that it can be used by multiple subsystems and we want to have
a consistent behavior - either removal of a sysfs_dirent should remove
every descendant entries or none instead of something inbetween.

This patch implements proper recursive removal in
__sysfs_remove_dir().  The function now walks its subtree in a
post-order walk to remove all descendants.

This is a behavior change but kobject / driver layer, which currently
is the only consumer, has already been updated to handle duplicate
removal attempts, so nothing should be broken after this change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 64 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 105a7e2d1660..0cdfd8128d3e 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -789,27 +789,81 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd)
 	remove_dir(sd);
 }
 
+static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
+{
+	struct sysfs_dirent *last;
+
+	while (true) {
+		struct rb_node *rbn;
+
+		last = pos;
+
+		if (sysfs_type(pos) != SYSFS_DIR)
+			break;
+
+		rbn = rb_first(&pos->s_dir.children);
+		if (!rbn)
+			break;
+
+		pos = to_sysfs_dirent(rbn);
+	}
+
+	return last;
+}
+
+/**
+ * sysfs_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: sysfs_dirent whose descendants to walk
+ *
+ * Find the next descendant to visit for post-order traversal of @root's
+ * descendants.  @root is included in the iteration and the last node to be
+ * visited.
+ */
+static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
+						       struct sysfs_dirent *root)
+{
+	struct rb_node *rbn;
+
+	lockdep_assert_held(&sysfs_mutex);
+
+	/* if first iteration, visit leftmost descendant which may be root */
+	if (!pos)
+		return sysfs_leftmost_descendant(root);
+
+	/* if we visited @root, we're done */
+	if (pos == root)
+		return NULL;
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	rbn = rb_next(&pos->s_rb);
+	if (rbn)
+		return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
+
+	/* no sibling left, visit parent */
+	return pos->s_parent;
+}
 
 static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 {
 	struct sysfs_addrm_cxt acxt;
-	struct rb_node *pos;
+	struct sysfs_dirent *pos, *next;
 
 	if (!dir_sd)
 		return;
 
 	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
 	sysfs_addrm_start(&acxt);
-	pos = rb_first(&dir_sd->s_dir.children);
-	while (pos) {
-		struct sysfs_dirent *sd = to_sysfs_dirent(pos);
-		pos = rb_next(pos);
-		if (sysfs_type(sd) != SYSFS_DIR)
-			sysfs_remove_one(&acxt, sd);
-	}
-	sysfs_addrm_finish(&acxt);
 
-	remove_dir(dir_sd);
+	next = NULL;
+	do {
+		pos = next;
+		next = sysfs_next_descendant_post(pos, dir_sd);
+		if (pos)
+			sysfs_remove_one(&acxt, pos);
+	} while (next);
+
+	sysfs_addrm_finish(&acxt);
 }
 
 /**
@@ -820,7 +874,6 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
  *	the directory before we remove the directory, and we've inlined
  *	what used to be sysfs_rmdir() below, instead of calling separately.
  */
-
 void sysfs_remove_dir(struct kobject *kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
-- 
cgit v1.3


From 250f7c3fee52b71457b4aa2cafadbd9f8b320b31 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 18 Sep 2013 17:15:38 -0400
Subject: sysfs: introduce [__]sysfs_remove()

Given a sysfs_dirent, there is no reason to have multiple versions of
removal functions.  A function which removes the specified
sysfs_dirent and its descendants is enough.

This patch intorduces [__}sysfs_remove() which replaces all internal
variations of removal functions.  This will be the only removal
function in the planned new sysfs_dirent based interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c   | 47 ++++++++++++++++++++++++-----------------------
 fs/sysfs/group.c |  4 ++--
 fs/sysfs/inode.c |  2 +-
 fs/sysfs/sysfs.h |  4 ++--
 4 files changed, 29 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0cdfd8128d3e..b518afd0d11e 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -545,7 +545,8 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
  *	LOCKING:
  *	Determined by sysfs_addrm_start().
  */
-void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
+			     struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
@@ -775,20 +776,6 @@ const struct inode_operations sysfs_dir_inode_operations = {
 	.setxattr	= sysfs_setxattr,
 };
 
-static void remove_dir(struct sysfs_dirent *sd)
-{
-	struct sysfs_addrm_cxt acxt;
-
-	sysfs_addrm_start(&acxt);
-	sysfs_remove_one(&acxt, sd);
-	sysfs_addrm_finish(&acxt);
-}
-
-void sysfs_remove_subdir(struct sysfs_dirent *sd)
-{
-	remove_dir(sd);
-}
-
 static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
 {
 	struct sysfs_dirent *last;
@@ -844,25 +831,36 @@ static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
 	return pos->s_parent;
 }
 
-static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
+void __sysfs_remove(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
-	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *pos, *next;
 
-	if (!dir_sd)
+	if (!sd)
 		return;
 
-	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
-	sysfs_addrm_start(&acxt);
+	pr_debug("sysfs %s: removing\n", sd->s_name);
 
 	next = NULL;
 	do {
 		pos = next;
-		next = sysfs_next_descendant_post(pos, dir_sd);
+		next = sysfs_next_descendant_post(pos, sd);
 		if (pos)
-			sysfs_remove_one(&acxt, pos);
+			sysfs_remove_one(acxt, pos);
 	} while (next);
+}
 
+/**
+ * sysfs_remove - remove a sysfs_dirent recursively
+ * @sd: the sysfs_dirent to remove
+ *
+ * Remove @sd along with all its subdirectories and files.
+ */
+void sysfs_remove(struct sysfs_dirent *sd)
+{
+	struct sysfs_addrm_cxt acxt;
+
+	sysfs_addrm_start(&acxt);
+	__sysfs_remove(&acxt, sd);
 	sysfs_addrm_finish(&acxt);
 }
 
@@ -882,7 +880,10 @@ void sysfs_remove_dir(struct kobject *kobj)
 	kobj->sd = NULL;
 	spin_unlock(&sysfs_assoc_lock);
 
-	__sysfs_remove_dir(sd);
+	if (sd) {
+		WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
+		sysfs_remove(sd);
+	}
 }
 
 int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2dae55c4f7dc..1898a10e38ce 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -111,7 +111,7 @@ static int internal_create_group(struct kobject *kobj, int update,
 	error = create_files(sd, kobj, grp, update);
 	if (error) {
 		if (grp->name)
-			sysfs_remove_subdir(sd);
+			sysfs_remove(sd);
 	}
 	sysfs_put(sd);
 	return error;
@@ -219,7 +219,7 @@ void sysfs_remove_group(struct kobject *kobj,
 
 	remove_files(sd, kobj, grp);
 	if (grp->name)
-		sysfs_remove_subdir(sd);
+		sysfs_remove(sd);
 
 	sysfs_put(sd);
 }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 364c8873fbda..63f755ef71dd 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -330,7 +330,7 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
 
 	sd = sysfs_find_dirent(dir_sd, name, ns);
 	if (sd)
-		sysfs_remove_one(&acxt, sd);
+		__sysfs_remove(&acxt, sd);
 
 	sysfs_addrm_finish(&acxt);
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 4d1154411cdb..4b1d8258b071 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -158,7 +158,8 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
 		    struct sysfs_dirent *parent_sd);
 int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
 		  struct sysfs_dirent *parent_sd);
-void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
+void __sysfs_remove(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
+void sysfs_remove(struct sysfs_dirent *sd);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
@@ -170,7 +171,6 @@ void release_sysfs_dirent(struct sysfs_dirent *sd);
 
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd);
-void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
 int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
 		 const char *new_name, const void *new_ns);
-- 
cgit v1.3


From bef292a72daf215c00aa20f68603de181afbb4d3 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 3 Oct 2013 18:43:20 +0100
Subject: GFS2: Remove obsolete quota tunable

There is no need for a paramater which relates to the internals
of quota to be exposed to users. The only possible use would be
to turn it up so large that the memory allocation fails. So lets
remove it and set it to a sensible value which ensures that we
don't ask for multipage allocations.

Currently the size of struct gfs2_holder means that the caluclated
value is identical to the previous default value, so there should
be no functional change.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/incore.h     | 1 -
 fs/gfs2/ops_fstype.c | 1 -
 fs/gfs2/quota.c      | 2 +-
 fs/gfs2/sys.c        | 2 --
 4 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 082c8fa7fab9..37b3cd795d6c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -537,7 +537,6 @@ struct gfs2_tune {
 
 	unsigned int gt_logd_secs;
 
-	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
 	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
 	unsigned int gt_quota_scale_num; /* Numerator */
 	unsigned int gt_quota_scale_den; /* Denominator */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 19ff5e8c285c..8e40fda985d6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -51,7 +51,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
 	gt->gt_quota_scale_den = 1;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index fd1ec5243d2e..658abe26a098 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1140,7 +1140,7 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_quota_data **qda;
-	unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
+	unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder);
 	unsigned int num_qd;
 	unsigned int x;
 	int error = 0;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index aa5c48044966..d09f6edda0ff 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -587,7 +587,6 @@ TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
@@ -597,7 +596,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_max_readahead.attr,
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
-	&tune_attr_quota_simul_sync.attr,
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
-- 
cgit v1.3


From 1bf59bf6ded8411058b4c9acc45d9dd8d6965464 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 4 Oct 2013 11:14:46 +0100
Subject: GFS2: Make two similar quota code fragments into a function

There should be no functional change bar the removal of a
test of the MS_READONLY flag which would never be reachable.
This merges the common code from qd_fish and qd_trylock into
a single function and calls it from both those places.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/quota.c | 60 +++++++++++++++++++++++++--------------------------------
 1 file changed, 26 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 658abe26a098..75414e773ab4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -383,6 +383,25 @@ static void bh_put(struct gfs2_quota_data *qd)
 	mutex_unlock(&sdp->sd_quota_mutex);
 }
 
+static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
+			 u64 *sync_gen)
+{
+	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+	    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
+	    (sync_gen && (qd->qd_sync_gen >= *sync_gen)))
+		return 0;
+
+	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+
+	set_bit(QDF_LOCKED, &qd->qd_flags);
+	gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
+	atomic_inc(&qd->qd_count);
+	qd->qd_change_sync = qd->qd_change;
+	gfs2_assert_warn(sdp, qd->qd_slot_count);
+	qd->qd_slot_count++;
+	return 1;
+}
+
 static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 {
 	struct gfs2_quota_data *qd = NULL;
@@ -397,22 +416,9 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	spin_lock(&qd_lru_lock);
 
 	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
-		if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
-		    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
-		    qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
-			continue;
-
-		list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
-
-		set_bit(QDF_LOCKED, &qd->qd_flags);
-		gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
-		atomic_inc(&qd->qd_count);
-		qd->qd_change_sync = qd->qd_change;
-		gfs2_assert_warn(sdp, qd->qd_slot_count);
-		qd->qd_slot_count++;
-		found = 1;
-
-		break;
+		found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen);
+		if (found)
+			break;
 	}
 
 	if (!found)
@@ -439,28 +445,14 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 static int qd_trylock(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-
-	if (sdp->sd_vfs->s_flags & MS_RDONLY)
-		return 0;
+	int found;
 
 	spin_lock(&qd_lru_lock);
+	found = qd_check_sync(sdp, qd, NULL);
+	spin_unlock(&qd_lru_lock);
 
-	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
-	    !test_bit(QDF_CHANGE, &qd->qd_flags)) {
-		spin_unlock(&qd_lru_lock);
+	if (!found)
 		return 0;
-	}
-
-	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
-
-	set_bit(QDF_LOCKED, &qd->qd_flags);
-	gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
-	atomic_inc(&qd->qd_count);
-	qd->qd_change_sync = qd->qd_change;
-	gfs2_assert_warn(sdp, qd->qd_slot_count);
-	qd->qd_slot_count++;
-
-	spin_unlock(&qd_lru_lock);
 
 	gfs2_assert_warn(sdp, qd->qd_change_sync);
 	if (bh_get(qd)) {
-- 
cgit v1.3


From aabd7c72f52145fcf13f9251770b0b0246b5e406 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 4 Oct 2013 11:31:05 +0100
Subject: GFS2: Inline qd_trylock into gfs2_quota_unlock

The function qd_trylock was not a trylock despite its name and
can be inlined into gfs2_quota_unlock in order to make the
code a bit clearer. There should be no functional change as a
result of this patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/quota.c | 45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 75414e773ab4..cfb4cdeddacb 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -442,29 +442,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	return 0;
 }
 
-static int qd_trylock(struct gfs2_quota_data *qd)
-{
-	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-	int found;
-
-	spin_lock(&qd_lru_lock);
-	found = qd_check_sync(sdp, qd, NULL);
-	spin_unlock(&qd_lru_lock);
-
-	if (!found)
-		return 0;
-
-	gfs2_assert_warn(sdp, qd->qd_change_sync);
-	if (bh_get(qd)) {
-		clear_bit(QDF_LOCKED, &qd->qd_flags);
-		slot_put(qd);
-		qd_put(qd);
-		return 0;
-	}
-
-	return 1;
-}
-
 static void qd_unlock(struct gfs2_quota_data *qd)
 {
 	gfs2_assert_warn(qd->qd_gl->gl_sbd,
@@ -1015,9 +992,11 @@ static int need_sync(struct gfs2_quota_data *qd)
 
 void gfs2_quota_unlock(struct gfs2_inode *ip)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data *qda[4];
 	unsigned int count = 0;
 	unsigned int x;
+	int found;
 
 	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
 		goto out;
@@ -1030,9 +1009,25 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
 		sync = need_sync(qd);
 
 		gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+		if (!sync)
+			continue;
+
+		spin_lock(&qd_lru_lock);
+		found = qd_check_sync(sdp, qd, NULL);
+		spin_unlock(&qd_lru_lock);
+
+		if (!found)
+			continue;
+
+		gfs2_assert_warn(sdp, qd->qd_change_sync);
+		if (bh_get(qd)) {
+			clear_bit(QDF_LOCKED, &qd->qd_flags);
+			slot_put(qd);
+			qd_put(qd);
+			continue;
+		}
 
-		if (sync && qd_trylock(qd))
-			qda[count++] = qd;
+		qda[count++] = qd;
 	}
 
 	if (count) {
-- 
cgit v1.3


From e46c772dbafc2059b0c7ee87c6a7232baadaf6c7 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 4 Oct 2013 12:29:34 +0100
Subject: GFS2: Protect quota sync generation

Now that gfs2_quota_sync can be potentially called from multiple
threads, we should protect this bit of code, and the sync generation
number in particular in order to ensure that there are no races
when syncing quotas.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/incore.h     | 1 +
 fs/gfs2/ops_fstype.c | 1 +
 fs/gfs2/quota.c      | 6 ++++--
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 37b3cd795d6c..2ab4f8d8f4c4 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -714,6 +714,7 @@ struct gfs2_sbd {
 	struct list_head sd_quota_list;
 	atomic_t sd_quota_count;
 	struct mutex sd_quota_mutex;
+	struct mutex sd_quota_sync_mutex;
 	wait_queue_head_t sd_quota_wait;
 	struct list_head sd_trunc_list;
 	spinlock_t sd_trunc_lock;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8e40fda985d6..82303b474958 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -93,6 +93,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	INIT_LIST_HEAD(&sdp->sd_quota_list);
 	mutex_init(&sdp->sd_quota_mutex);
+	mutex_init(&sdp->sd_quota_sync_mutex);
 	init_waitqueue_head(&sdp->sd_quota_wait);
 	INIT_LIST_HEAD(&sdp->sd_trunc_list);
 	spin_lock_init(&sdp->sd_trunc_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index cfb4cdeddacb..4a9726aa191f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1132,12 +1132,13 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 	unsigned int x;
 	int error = 0;
 
-	sdp->sd_quota_sync_gen++;
-
 	qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
 	if (!qda)
 		return -ENOMEM;
 
+	mutex_lock(&sdp->sd_quota_sync_mutex);
+	sdp->sd_quota_sync_gen++;
+
 	do {
 		num_qd = 0;
 
@@ -1162,6 +1163,7 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 		}
 	} while (!error && num_qd == max_qd);
 
+	mutex_unlock(&sdp->sd_quota_sync_mutex);
 	kfree(qda);
 
 	return error;
-- 
cgit v1.3


From 89e51dab7cb026193714f2858dbce203c98ecdec Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:41:55 -0400
Subject: sysfs: remove unused sysfs_buffer->pos

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1656a79ea6c0..81e3f727833f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -44,7 +44,6 @@ struct sysfs_open_dirent {
 
 struct sysfs_buffer {
 	size_t			count;
-	loff_t			pos;
 	char			*page;
 	const struct sysfs_ops	*ops;
 	struct mutex		mutex;
-- 
cgit v1.3


From aea585ef8fa6516395022e9d2fed6ec5014128bc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:41:56 -0400
Subject: sysfs: remove sysfs_buffer->needs_read_fill

->needs_read_fill is used to implement the following behaviors.

1. Ensure buffer filling on the first read.
2. Force buffer filling after a write.
3. Force buffer filling after a successful poll.

However, #2 and #3 don't really work as sysfs doesn't reset file
position.  While the read buffer would be refilled, the next read
would continue from the position after the last read or write,
requiring an explicit seek to the start for it to be useful, which
makes ->needs_read_fill superflous as read buffer is always refilled
if f_pos == 0.

Update sysfs_read_file() to test buffer->page for #1 instead and
remove ->needs_read_fill.  While this changes behavior in extreme
corner cases - e.g. re-reading a sysfs file after seeking to non-zero
position after a write or poll, it's highly unlikely to lead to actual
breakage.  This change is to prepare for using seq_file in the read
path.

While at it, reformat a comment in fill_write_buffer().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 81e3f727833f..e2fafc0a9b36 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -47,7 +47,6 @@ struct sysfs_buffer {
 	char			*page;
 	const struct sysfs_ops	*ops;
 	struct mutex		mutex;
-	int			needs_read_fill;
 	int			event;
 	struct list_head	list;
 };
@@ -95,12 +94,10 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 		/* Try to struggle along */
 		count = PAGE_SIZE - 1;
 	}
-	if (count >= 0) {
-		buffer->needs_read_fill = 0;
+	if (count >= 0)
 		buffer->count = count;
-	} else {
+	else
 		ret = count;
-	}
 	return ret;
 }
 
@@ -130,7 +127,11 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	ssize_t retval = 0;
 
 	mutex_lock(&buffer->mutex);
-	if (buffer->needs_read_fill || *ppos == 0) {
+	/*
+	 * Fill on zero offset and the first read so that silly things like
+	 * "dd bs=1 skip=N" can work on sysfs files.
+	 */
+	if (*ppos == 0 || !buffer->page) {
 		retval = fill_read_buffer(file->f_path.dentry, buffer);
 		if (retval)
 			goto out;
@@ -166,14 +167,15 @@ static int fill_write_buffer(struct sysfs_buffer *buffer,
 	if (count >= PAGE_SIZE)
 		count = PAGE_SIZE - 1;
 	error = copy_from_user(buffer->page, buf, count);
-	buffer->needs_read_fill = 1;
-	/* if buf is assumed to contain a string, terminate it by \0,
-	   so e.g. sscanf() can scan the string easily */
+
+	/*
+	 * If buf is assumed to contain a string, terminate it by \0, so
+	 * e.g. sscanf() can scan the string easily.
+	 */
 	buffer->page[count] = 0;
 	return error ? -EFAULT : count;
 }
 
-
 /**
  *	flush_write_buffer - push buffer to kobject.
  *	@dentry:	dentry to the attribute
@@ -368,7 +370,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		goto err_out;
 
 	mutex_init(&buffer->mutex);
-	buffer->needs_read_fill = 1;
 	buffer->ops = ops;
 	file->private_data = buffer;
 
@@ -435,7 +436,6 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	return DEFAULT_POLLMASK;
 
  trigger:
-	buffer->needs_read_fill = 1;
 	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
-- 
cgit v1.3


From 375b611e60f7c1ce6913417ca254efe5523f1a72 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:41:57 -0400
Subject: sysfs: remove sysfs_buffer->ops

Currently, sysfs_ops is fetched during sysfs_open_file() and cached in
sysfs_buffer->ops to be used while the file is open.  This patch
removes the caching and makes each operation directly fetch sysfs_ops.

This patch doesn't introduce any behavior difference and is to prepare
for merging regular and bin file supports.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e2fafc0a9b36..7dfcc3317490 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -45,12 +45,23 @@ struct sysfs_open_dirent {
 struct sysfs_buffer {
 	size_t			count;
 	char			*page;
-	const struct sysfs_ops	*ops;
 	struct mutex		mutex;
 	int			event;
 	struct list_head	list;
 };
 
+/*
+ * Determine ktype->sysfs_ops for the given sysfs_dirent.  This function
+ * must be called while holding an active reference.
+ */
+static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
+{
+	struct kobject *kobj = sd->s_parent->s_dir.kobj;
+
+	lockdep_assert_held(sd);
+	return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
+}
+
 /**
  *	fill_read_buffer - allocate and fill buffer from object.
  *	@dentry:	dentry pointer.
@@ -66,7 +77,7 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops *ops = buffer->ops;
+	const struct sysfs_ops *ops;
 	int ret = 0;
 	ssize_t count;
 
@@ -80,6 +91,8 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 		return -ENODEV;
 
 	buffer->event = atomic_read(&attr_sd->s_attr.open->event);
+
+	ops = sysfs_file_ops(attr_sd);
 	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
 
 	sysfs_put_active(attr_sd);
@@ -191,13 +204,14 @@ static int flush_write_buffer(struct dentry *dentry,
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops *ops = buffer->ops;
+	const struct sysfs_ops *ops;
 	int rc;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
+	ops = sysfs_file_ops(attr_sd);
 	rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
 
 	sysfs_put_active(attr_sd);
@@ -205,7 +219,6 @@ static int flush_write_buffer(struct dentry *dentry,
 	return rc;
 }
 
-
 /**
  *	sysfs_write_file - write an attribute.
  *	@file:	file pointer
@@ -334,14 +347,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		return -ENODEV;
 
 	/* every kobject with an attribute needs a ktype assigned */
-	if (kobj->ktype && kobj->ktype->sysfs_ops)
-		ops = kobj->ktype->sysfs_ops;
-	else {
-		WARN(1, KERN_ERR
-		     "missing sysfs attribute operations for kobject: %s\n",
-		     kobject_name(kobj));
+	ops = sysfs_file_ops(attr_sd);
+	if (WARN(!ops, KERN_ERR
+		 "missing sysfs attribute operations for kobject: %s\n",
+		 kobject_name(kobj)))
 		goto err_out;
-	}
 
 	/* File needs write support.
 	 * The inode's perms must say it's ok,
@@ -370,7 +380,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		goto err_out;
 
 	mutex_init(&buffer->mutex);
-	buffer->ops = ops;
 	file->private_data = buffer;
 
 	/* make sure we have open dirent struct */
-- 
cgit v1.3


From c75ec764cf4746a2406278ffa16f590c5db290a7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:41:58 -0400
Subject: sysfs: add sysfs_open_file_mutex

Add a separate mutex to protect sysfs_open_dirent->buffers list.  This
will allow performing sleepable operations while traversing
sysfs_buffers, which will be renamed to sysfs_open_file.

Note that currently sysfs_open_dirent->buffers list isn't being used
for anything and this patch doesn't make any functional difference.
It will be used to merge regular and bin file supports.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 7dfcc3317490..499cff8554fc 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -25,15 +25,17 @@
 #include "sysfs.h"
 
 /*
- * There's one sysfs_buffer for each open file and one
- * sysfs_open_dirent for each sysfs_dirent with one or more open
- * files.
+ * There's one sysfs_buffer for each open file and one sysfs_open_dirent
+ * for each sysfs_dirent with one or more open files.
  *
- * filp->private_data points to sysfs_buffer and
- * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open
- * is protected by sysfs_open_dirent_lock.
+ * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open is
+ * protected by sysfs_open_dirent_lock.
+ *
+ * filp->private_data points to sysfs_buffer which is chained at
+ * sysfs_open_dirent->buffers, which is protected by sysfs_open_file_mutex.
  */
 static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
+static DEFINE_MUTEX(sysfs_open_file_mutex);
 
 struct sysfs_open_dirent {
 	atomic_t		refcnt;
@@ -272,6 +274,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 	struct sysfs_open_dirent *od, *new_od = NULL;
 
  retry:
+	mutex_lock(&sysfs_open_file_mutex);
 	spin_lock_irq(&sysfs_open_dirent_lock);
 
 	if (!sd->s_attr.open && new_od) {
@@ -286,6 +289,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 	}
 
 	spin_unlock_irq(&sysfs_open_dirent_lock);
+	mutex_unlock(&sysfs_open_file_mutex);
 
 	if (od) {
 		kfree(new_od);
@@ -321,6 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
 	struct sysfs_open_dirent *od = sd->s_attr.open;
 	unsigned long flags;
 
+	mutex_lock(&sysfs_open_file_mutex);
 	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
 	list_del(&buffer->list);
@@ -330,6 +335,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
 		od = NULL;
 
 	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
+	mutex_unlock(&sysfs_open_file_mutex);
 
 	kfree(od);
 }
-- 
cgit v1.3


From 58282d8dc2e7cf2b87c8fee94d7138ed08e0a2e5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:41:59 -0400
Subject: sysfs: rename sysfs_buffer to sysfs_open_file

sysfs read path will be converted to use seq_file which will handle
buffering making sysfs_buffer a misnomer.  Rename sysfs_buffer to
sysfs_open_file, and sysfs_open_dirent->buffers to ->files.

This path is pure rename.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 127 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 499cff8554fc..4b55bcf4422e 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -25,14 +25,14 @@
 #include "sysfs.h"
 
 /*
- * There's one sysfs_buffer for each open file and one sysfs_open_dirent
+ * There's one sysfs_open_file for each open file and one sysfs_open_dirent
  * for each sysfs_dirent with one or more open files.
  *
  * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open is
  * protected by sysfs_open_dirent_lock.
  *
- * filp->private_data points to sysfs_buffer which is chained at
- * sysfs_open_dirent->buffers, which is protected by sysfs_open_file_mutex.
+ * filp->private_data points to sysfs_open_file which is chained at
+ * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
  */
 static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
 static DEFINE_MUTEX(sysfs_open_file_mutex);
@@ -41,10 +41,10 @@ struct sysfs_open_dirent {
 	atomic_t		refcnt;
 	atomic_t		event;
 	wait_queue_head_t	poll;
-	struct list_head	buffers; /* goes through sysfs_buffer.list */
+	struct list_head	files; /* goes through sysfs_open_file.list */
 };
 
-struct sysfs_buffer {
+struct sysfs_open_file {
 	size_t			count;
 	char			*page;
 	struct mutex		mutex;
@@ -75,7 +75,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
  *	This is called only once, on the file's first read unless an error
  *	is returned.
  */
-static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
+static int fill_read_buffer(struct dentry *dentry, struct sysfs_open_file *of)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
@@ -83,19 +83,19 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 	int ret = 0;
 	ssize_t count;
 
-	if (!buffer->page)
-		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
-	if (!buffer->page)
+	if (!of->page)
+		of->page = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!of->page)
 		return -ENOMEM;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
-	buffer->event = atomic_read(&attr_sd->s_attr.open->event);
+	of->event = atomic_read(&attr_sd->s_attr.open->event);
 
 	ops = sysfs_file_ops(attr_sd);
-	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
+	count = ops->show(kobj, attr_sd->s_attr.attr, of->page);
 
 	sysfs_put_active(attr_sd);
 
@@ -110,7 +110,7 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 		count = PAGE_SIZE - 1;
 	}
 	if (count >= 0)
-		buffer->count = count;
+		of->count = count;
 	else
 		ret = count;
 	return ret;
@@ -138,63 +138,62 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 static ssize_t
 sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
-	struct sysfs_buffer *buffer = file->private_data;
+	struct sysfs_open_file *of = file->private_data;
 	ssize_t retval = 0;
 
-	mutex_lock(&buffer->mutex);
+	mutex_lock(&of->mutex);
 	/*
 	 * Fill on zero offset and the first read so that silly things like
 	 * "dd bs=1 skip=N" can work on sysfs files.
 	 */
-	if (*ppos == 0 || !buffer->page) {
-		retval = fill_read_buffer(file->f_path.dentry, buffer);
+	if (*ppos == 0 || !of->page) {
+		retval = fill_read_buffer(file->f_path.dentry, of);
 		if (retval)
 			goto out;
 	}
 	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
-		 __func__, count, *ppos, buffer->page);
-	retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
-					 buffer->count);
+		 __func__, count, *ppos, of->page);
+	retval = simple_read_from_buffer(buf, count, ppos, of->page, of->count);
 out:
-	mutex_unlock(&buffer->mutex);
+	mutex_unlock(&of->mutex);
 	return retval;
 }
 
 /**
  *	fill_write_buffer - copy buffer from userspace.
- *	@buffer:	data buffer for file.
+ *	@of:		open file struct.
  *	@buf:		data from user.
  *	@count:		number of bytes in @userbuf.
  *
- *	Allocate @buffer->page if it hasn't been already, then
- *	copy the user-supplied buffer into it.
+ *	Allocate @of->page if it hasn't been already, then copy the
+ *	user-supplied buffer into it.
  */
-static int fill_write_buffer(struct sysfs_buffer *buffer,
+static int fill_write_buffer(struct sysfs_open_file *of,
 			     const char __user *buf, size_t count)
 {
 	int error;
 
-	if (!buffer->page)
-		buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
-	if (!buffer->page)
+	if (!of->page)
+		of->page = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!of->page)
 		return -ENOMEM;
 
 	if (count >= PAGE_SIZE)
 		count = PAGE_SIZE - 1;
-	error = copy_from_user(buffer->page, buf, count);
+	error = copy_from_user(of->page, buf, count);
 
 	/*
 	 * If buf is assumed to contain a string, terminate it by \0, so
 	 * e.g. sscanf() can scan the string easily.
 	 */
-	buffer->page[count] = 0;
+	of->page[count] = 0;
 	return error ? -EFAULT : count;
 }
 
 /**
  *	flush_write_buffer - push buffer to kobject.
  *	@dentry:	dentry to the attribute
- *	@buffer:	data buffer for file.
+ *	@of:		open file
  *	@count:		number of bytes
  *
  *	Get the correct pointers for the kobject and the attribute we're
@@ -202,7 +201,7 @@ static int fill_write_buffer(struct sysfs_buffer *buffer,
  *	passing the buffer that we acquired in fill_write_buffer().
  */
 static int flush_write_buffer(struct dentry *dentry,
-			      struct sysfs_buffer *buffer, size_t count)
+			      struct sysfs_open_file *of, size_t count)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
@@ -214,7 +213,7 @@ static int flush_write_buffer(struct dentry *dentry,
 		return -ENODEV;
 
 	ops = sysfs_file_ops(attr_sd);
-	rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
+	rc = ops->store(kobj, attr_sd->s_attr.attr, of->page, count);
 
 	sysfs_put_active(attr_sd);
 
@@ -240,27 +239,26 @@ static int flush_write_buffer(struct dentry *dentry,
 static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct sysfs_buffer *buffer = file->private_data;
+	struct sysfs_open_file *of = file->private_data;
 	ssize_t len;
 
-	mutex_lock(&buffer->mutex);
-	len = fill_write_buffer(buffer, buf, count);
+	mutex_lock(&of->mutex);
+	len = fill_write_buffer(of, buf, count);
 	if (len > 0)
-		len = flush_write_buffer(file->f_path.dentry, buffer, len);
+		len = flush_write_buffer(file->f_path.dentry, of, len);
 	if (len > 0)
 		*ppos += len;
-	mutex_unlock(&buffer->mutex);
+	mutex_unlock(&of->mutex);
 	return len;
 }
 
 /**
  *	sysfs_get_open_dirent - get or create sysfs_open_dirent
  *	@sd: target sysfs_dirent
- *	@buffer: sysfs_buffer for this instance of open
+ *	@of: sysfs_open_file for this instance of open
  *
  *	If @sd->s_attr.open exists, increment its reference count;
- *	otherwise, create one.  @buffer is chained to the buffers
- *	list.
+ *	otherwise, create one.  @of is chained to the files list.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).
@@ -269,7 +267,7 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
  *	0 on success, -errno on failure.
  */
 static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
-				 struct sysfs_buffer *buffer)
+				 struct sysfs_open_file *of)
 {
 	struct sysfs_open_dirent *od, *new_od = NULL;
 
@@ -285,7 +283,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 	od = sd->s_attr.open;
 	if (od) {
 		atomic_inc(&od->refcnt);
-		list_add_tail(&buffer->list, &od->buffers);
+		list_add_tail(&of->list, &od->files);
 	}
 
 	spin_unlock_irq(&sysfs_open_dirent_lock);
@@ -304,23 +302,23 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 	atomic_set(&new_od->refcnt, 0);
 	atomic_set(&new_od->event, 1);
 	init_waitqueue_head(&new_od->poll);
-	INIT_LIST_HEAD(&new_od->buffers);
+	INIT_LIST_HEAD(&new_od->files);
 	goto retry;
 }
 
 /**
  *	sysfs_put_open_dirent - put sysfs_open_dirent
  *	@sd: target sysfs_dirent
- *	@buffer: associated sysfs_buffer
+ *	@of: associated sysfs_open_file
  *
- *	Put @sd->s_attr.open and unlink @buffer from the buffers list.
- *	If reference count reaches zero, disassociate and free it.
+ *	Put @sd->s_attr.open and unlink @of from the files list.  If
+ *	reference count reaches zero, disassociate and free it.
  *
  *	LOCKING:
  *	None.
  */
 static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
-				  struct sysfs_buffer *buffer)
+				  struct sysfs_open_file *of)
 {
 	struct sysfs_open_dirent *od = sd->s_attr.open;
 	unsigned long flags;
@@ -328,7 +326,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
 	mutex_lock(&sysfs_open_file_mutex);
 	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
-	list_del(&buffer->list);
+	list_del(&of->list);
 	if (atomic_dec_and_test(&od->refcnt))
 		sd->s_attr.open = NULL;
 	else
@@ -344,7 +342,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_buffer *buffer;
+	struct sysfs_open_file *of;
 	const struct sysfs_ops *ops;
 	int error = -EACCES;
 
@@ -377,19 +375,20 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 			goto err_out;
 	}
 
-	/* No error? Great, allocate a buffer for the file, and store it
-	 * it in file->private_data for easy access.
+	/*
+	 * No error? Great, allocate a sysfs_open_file for the file, and
+	 * store it it in file->private_data for easy access.
 	 */
 	error = -ENOMEM;
-	buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL);
-	if (!buffer)
+	of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
+	if (!of)
 		goto err_out;
 
-	mutex_init(&buffer->mutex);
-	file->private_data = buffer;
+	mutex_init(&of->mutex);
+	file->private_data = of;
 
 	/* make sure we have open dirent struct */
-	error = sysfs_get_open_dirent(attr_sd, buffer);
+	error = sysfs_get_open_dirent(attr_sd, of);
 	if (error)
 		goto err_free;
 
@@ -398,7 +397,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	return 0;
 
  err_free:
-	kfree(buffer);
+	kfree(of);
  err_out:
 	sysfs_put_active(attr_sd);
 	return error;
@@ -407,13 +406,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 static int sysfs_release(struct inode *inode, struct file *filp)
 {
 	struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
-	struct sysfs_buffer *buffer = filp->private_data;
+	struct sysfs_open_file *of = filp->private_data;
 
-	sysfs_put_open_dirent(sd, buffer);
+	sysfs_put_open_dirent(sd, of);
 
-	if (buffer->page)
-		free_page((unsigned long)buffer->page);
-	kfree(buffer);
+	if (of->page)
+		free_page((unsigned long)of->page);
+	kfree(of);
 
 	return 0;
 }
@@ -433,7 +432,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
  */
 static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 {
-	struct sysfs_buffer *buffer = filp->private_data;
+	struct sysfs_open_file *of = filp->private_data;
 	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
 	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
@@ -445,7 +444,7 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 
 	sysfs_put_active(attr_sd);
 
-	if (buffer->event != atomic_read(&od->event))
+	if (of->event != atomic_read(&od->event))
 		goto trigger;
 
 	return DEFAULT_POLLMASK;
-- 
cgit v1.3


From bcafe4eea3e58a60e9c2c63781700a9ab1d70f93 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:00 -0400
Subject: sysfs: add sysfs_open_file->sd and ->file

sysfs will be converted to use seq_file for read path, which will make
it difficult to pass around multiple pointers directly.  This patch
adds sysfs_open_file->sd and ->file so that we can reach all the
necessary data structures from sysfs_open_file.

flush_write_buffer() is updated to drop @dentry which was used to
discover the sysfs_dirent as it's now available through
sysfs_open_file->sd.

This patch doesn't cause any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4b55bcf4422e..af6e9092a679 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -45,6 +45,8 @@ struct sysfs_open_dirent {
 };
 
 struct sysfs_open_file {
+	struct sysfs_dirent	*sd;
+	struct file		*file;
 	size_t			count;
 	char			*page;
 	struct mutex		mutex;
@@ -192,7 +194,6 @@ static int fill_write_buffer(struct sysfs_open_file *of,
 
 /**
  *	flush_write_buffer - push buffer to kobject.
- *	@dentry:	dentry to the attribute
  *	@of:		open file
  *	@count:		number of bytes
  *
@@ -200,22 +201,20 @@ static int fill_write_buffer(struct sysfs_open_file *of,
  *	dealing with, then call the store() method for the attribute,
  *	passing the buffer that we acquired in fill_write_buffer().
  */
-static int flush_write_buffer(struct dentry *dentry,
-			      struct sysfs_open_file *of, size_t count)
+static int flush_write_buffer(struct sysfs_open_file *of, size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
 	const struct sysfs_ops *ops;
 	int rc;
 
-	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
+	/* need @of->sd for attr and ops, its parent for kobj */
+	if (!sysfs_get_active(of->sd))
 		return -ENODEV;
 
-	ops = sysfs_file_ops(attr_sd);
-	rc = ops->store(kobj, attr_sd->s_attr.attr, of->page, count);
+	ops = sysfs_file_ops(of->sd);
+	rc = ops->store(kobj, of->sd->s_attr.attr, of->page, count);
 
-	sysfs_put_active(attr_sd);
+	sysfs_put_active(of->sd);
 
 	return rc;
 }
@@ -245,7 +244,7 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
 	mutex_lock(&of->mutex);
 	len = fill_write_buffer(of, buf, count);
 	if (len > 0)
-		len = flush_write_buffer(file->f_path.dentry, of, len);
+		len = flush_write_buffer(of, len);
 	if (len > 0)
 		*ppos += len;
 	mutex_unlock(&of->mutex);
@@ -385,6 +384,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		goto err_out;
 
 	mutex_init(&of->mutex);
+	of->sd = attr_sd;
+	of->file = file;
 	file->private_data = of;
 
 	/* make sure we have open dirent struct */
-- 
cgit v1.3


From 8ef445f0807457dd7d158e43d9e8f9568c47910d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:01 -0400
Subject: sysfs: use transient write buffer

There isn't much to be gained by keeping around kernel buffer while a
file is open especially as the read path planned to be converted to
use seq_file and won't use the buffer.  This patch makes
sysfs_write_file() use per-write transient buffer instead of
sysfs_open_file->page.

This simplifies the write path, enables removing sysfs_open_file->page
once read path is updated and will help merging bin file write path
which already requires the use of a transient buffer due to a locking
order issue.

As the function comments of flush_write_buffer() and
sysfs_write_buffer() are being updated anyway, reformat them so that
they're more conventional.

v2: Use min_t() instead of min() in sysfs_write_file() to avoid build
    warning on arm.  Reported by build test robot.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 114 ++++++++++++++++++++++++++------------------------------
 1 file changed, 52 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index af6e9092a679..53cc096e6a1b 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -162,92 +162,82 @@ out:
 }
 
 /**
- *	fill_write_buffer - copy buffer from userspace.
- *	@of:		open file struct.
- *	@buf:		data from user.
- *	@count:		number of bytes in @userbuf.
+ * flush_write_buffer - push buffer to kobject
+ * @of: open file
+ * @buf: data buffer for file
+ * @count: number of bytes
  *
- *	Allocate @of->page if it hasn't been already, then copy the
- *	user-supplied buffer into it.
+ * Get the correct pointers for the kobject and the attribute we're dealing
+ * with, then call the store() method for it with @buf.
  */
-static int fill_write_buffer(struct sysfs_open_file *of,
-			     const char __user *buf, size_t count)
-{
-	int error;
-
-	if (!of->page)
-		of->page = (char *)get_zeroed_page(GFP_KERNEL);
-	if (!of->page)
-		return -ENOMEM;
-
-	if (count >= PAGE_SIZE)
-		count = PAGE_SIZE - 1;
-	error = copy_from_user(of->page, buf, count);
-
-	/*
-	 * If buf is assumed to contain a string, terminate it by \0, so
-	 * e.g. sscanf() can scan the string easily.
-	 */
-	of->page[count] = 0;
-	return error ? -EFAULT : count;
-}
-
-/**
- *	flush_write_buffer - push buffer to kobject.
- *	@of:		open file
- *	@count:		number of bytes
- *
- *	Get the correct pointers for the kobject and the attribute we're
- *	dealing with, then call the store() method for the attribute,
- *	passing the buffer that we acquired in fill_write_buffer().
- */
-static int flush_write_buffer(struct sysfs_open_file *of, size_t count)
+static int flush_write_buffer(struct sysfs_open_file *of, char *buf,
+			      size_t count)
 {
 	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
 	const struct sysfs_ops *ops;
-	int rc;
+	int rc = 0;
 
-	/* need @of->sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active(of->sd))
+	/*
+	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
+	 * nests outside active ref and is just to ensure that the ops
+	 * aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!sysfs_get_active(of->sd)) {
+		mutex_unlock(&of->mutex);
 		return -ENODEV;
+	}
 
 	ops = sysfs_file_ops(of->sd);
-	rc = ops->store(kobj, of->sd->s_attr.attr, of->page, count);
+	rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
 
 	sysfs_put_active(of->sd);
+	mutex_unlock(&of->mutex);
 
 	return rc;
 }
 
 /**
- *	sysfs_write_file - write an attribute.
- *	@file:	file pointer
- *	@buf:	data to write
- *	@count:	number of bytes
- *	@ppos:	starting offset
+ * sysfs_write_file - write an attribute
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
  *
- *	Similar to sysfs_read_file(), though working in the opposite direction.
- *	We allocate and fill the data from the user in fill_write_buffer(),
- *	then push it to the kobject in flush_write_buffer().
- *	There is no easy way for us to know if userspace is only doing a partial
- *	write, so we don't support them. We expect the entire buffer to come
- *	on the first write.
- *	Hint: if you're writing a value, first read the file, modify only the
- *	the value you're changing, then write entire buffer back.
+ * Copy data in from userland and pass it to the matching
+ * sysfs_ops->store() by invoking flush_write_buffer().
+ *
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on
+ * the first write.  Hint: if you're writing a value, first read the file,
+ * modify only the the value you're changing, then write entire buffer
+ * back.
  */
-static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
+static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
 	struct sysfs_open_file *of = file->private_data;
-	ssize_t len;
+	ssize_t len = min_t(size_t, count, PAGE_SIZE - 1);
+	char *buf;
 
-	mutex_lock(&of->mutex);
-	len = fill_write_buffer(of, buf, count);
-	if (len > 0)
-		len = flush_write_buffer(of, len);
+	if (!len)
+		return 0;
+
+	buf = kmalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, user_buf, len)) {
+		len = -EFAULT;
+		goto out_free;
+	}
+	buf[len] = '\0';	/* guarantee string termination */
+
+	len = flush_write_buffer(of, buf, len);
 	if (len > 0)
 		*ppos += len;
-	mutex_unlock(&of->mutex);
+out_free:
+	kfree(buf);
 	return len;
 }
 
-- 
cgit v1.3


From 13c589d5b0ac654d9da7e490a2dd548e6b86b4a5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:02 -0400
Subject: sysfs: use seq_file when reading regular files

sysfs read path implements its own buffering scheme between userland
and kernel callbacks, which essentially is a degenerate duplicate of
seq_file.  This patch replaces the custom read buffering
implementation in sysfs with seq_file.

While the amount of code reduction is small, this reduces low level
hairiness and enables future development of a new versatile API based
on seq_file so that sysfs features can be shared with other
subsystems.

As write path was already converted to not use sysfs_open_file->page,
this patch makes ->page and ->count unused and removes them.

Userland behavior remains the same except for some extreme corner
cases - e.g. sysfs will now regenerate the content each time a file is
read after a non-contiguous seek whereas the original code would keep
using the same content.  While this is a userland visible behavior
change, it is extremely unlikely to be noticeable and brings sysfs
behavior closer to that of procfs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 164 +++++++++++++++++++++++++-------------------------------
 1 file changed, 73 insertions(+), 91 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 53cc096e6a1b..4921bda3a37a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -21,6 +21,7 @@
 #include <linux/mutex.h>
 #include <linux/limits.h>
 #include <linux/uaccess.h>
+#include <linux/seq_file.h>
 
 #include "sysfs.h"
 
@@ -31,7 +32,8 @@
  * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open is
  * protected by sysfs_open_dirent_lock.
  *
- * filp->private_data points to sysfs_open_file which is chained at
+ * filp->private_data points to seq_file whose ->private points to
+ * sysfs_open_file.  sysfs_open_files are chained at
  * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
  */
 static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
@@ -47,13 +49,16 @@ struct sysfs_open_dirent {
 struct sysfs_open_file {
 	struct sysfs_dirent	*sd;
 	struct file		*file;
-	size_t			count;
-	char			*page;
 	struct mutex		mutex;
 	int			event;
 	struct list_head	list;
 };
 
+static struct sysfs_open_file *sysfs_of(struct file *file)
+{
+	return ((struct seq_file *)file->private_data)->private;
+}
+
 /*
  * Determine ktype->sysfs_ops for the given sysfs_dirent.  This function
  * must be called while holding an active reference.
@@ -66,40 +71,54 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
 	return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
 }
 
-/**
- *	fill_read_buffer - allocate and fill buffer from object.
- *	@dentry:	dentry pointer.
- *	@buffer:	data buffer for file.
- *
- *	Allocate @buffer->page, if it hasn't been already, then call the
- *	kobject's show() method to fill the buffer with this attribute's
- *	data.
- *	This is called only once, on the file's first read unless an error
- *	is returned.
+/*
+ * Reads on sysfs are handled through seq_file, which takes care of hairy
+ * details like buffering and seeking.  The following function pipes
+ * sysfs_ops->show() result through seq_file.
  */
-static int fill_read_buffer(struct dentry *dentry, struct sysfs_open_file *of)
+static int sysfs_seq_show(struct seq_file *sf, void *v)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
+	struct sysfs_open_file *of = sf->private;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
 	const struct sysfs_ops *ops;
-	int ret = 0;
+	char *buf;
 	ssize_t count;
 
-	if (!of->page)
-		of->page = (char *) get_zeroed_page(GFP_KERNEL);
-	if (!of->page)
-		return -ENOMEM;
+	/* acquire buffer and ensure that it's >= PAGE_SIZE */
+	count = seq_get_buf(sf, &buf);
+	if (count < PAGE_SIZE) {
+		seq_commit(sf, -1);
+		return 0;
+	}
 
-	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
+	/*
+	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
+	 * nests outside active ref and is just to ensure that the ops
+	 * aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!sysfs_get_active(of->sd)) {
+		mutex_unlock(&of->mutex);
 		return -ENODEV;
+	}
 
-	of->event = atomic_read(&attr_sd->s_attr.open->event);
+	of->event = atomic_read(&of->sd->s_attr.open->event);
 
-	ops = sysfs_file_ops(attr_sd);
-	count = ops->show(kobj, attr_sd->s_attr.attr, of->page);
+	/*
+	 * Lookup @ops and invoke show().  Control may reach here via seq
+	 * file lseek even if @ops->show() isn't implemented.
+	 */
+	ops = sysfs_file_ops(of->sd);
+	if (ops->show)
+		count = ops->show(kobj, of->sd->s_attr.attr, buf);
+	else
+		count = 0;
 
-	sysfs_put_active(attr_sd);
+	sysfs_put_active(of->sd);
+	mutex_unlock(&of->mutex);
+
+	if (count < 0)
+		return count;
 
 	/*
 	 * The code works fine with PAGE_SIZE return but it's likely to
@@ -111,54 +130,8 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_open_file *of)
 		/* Try to struggle along */
 		count = PAGE_SIZE - 1;
 	}
-	if (count >= 0)
-		of->count = count;
-	else
-		ret = count;
-	return ret;
-}
-
-/**
- *	sysfs_read_file - read an attribute.
- *	@file:	file pointer.
- *	@buf:	buffer to fill.
- *	@count:	number of bytes to read.
- *	@ppos:	starting offset in file.
- *
- *	Userspace wants to read an attribute file. The attribute descriptor
- *	is in the file's ->d_fsdata. The target object is in the directory's
- *	->d_fsdata.
- *
- *	We call fill_read_buffer() to allocate and fill the buffer from the
- *	object's show() method exactly once (if the read is happening from
- *	the beginning of the file). That should fill the entire buffer with
- *	all the data the object has to offer for that attribute.
- *	We then call flush_read_buffer() to copy the buffer to userspace
- *	in the increments specified.
- */
-
-static ssize_t
-sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
-	struct sysfs_open_file *of = file->private_data;
-	ssize_t retval = 0;
-
-	mutex_lock(&of->mutex);
-	/*
-	 * Fill on zero offset and the first read so that silly things like
-	 * "dd bs=1 skip=N" can work on sysfs files.
-	 */
-	if (*ppos == 0 || !of->page) {
-		retval = fill_read_buffer(file->f_path.dentry, of);
-		if (retval)
-			goto out;
-	}
-	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
-		 __func__, count, *ppos, of->page);
-	retval = simple_read_from_buffer(buf, count, ppos, of->page, of->count);
-out:
-	mutex_unlock(&of->mutex);
-	return retval;
+	seq_commit(sf, count);
+	return 0;
 }
 
 /**
@@ -216,7 +189,7 @@ static int flush_write_buffer(struct sysfs_open_file *of, char *buf,
 static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
-	struct sysfs_open_file *of = file->private_data;
+	struct sysfs_open_file *of = sysfs_of(file);
 	ssize_t len = min_t(size_t, count, PAGE_SIZE - 1);
 	char *buf;
 
@@ -364,10 +337,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 			goto err_out;
 	}
 
-	/*
-	 * No error? Great, allocate a sysfs_open_file for the file, and
-	 * store it it in file->private_data for easy access.
-	 */
+	/* allocate a sysfs_open_file for the file */
 	error = -ENOMEM;
 	of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
 	if (!of)
@@ -376,20 +346,34 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	mutex_init(&of->mutex);
 	of->sd = attr_sd;
 	of->file = file;
-	file->private_data = of;
+
+	/*
+	 * Always instantiate seq_file even if read access is not
+	 * implemented or requested.  This unifies private data access and
+	 * most files are readable anyway.
+	 */
+	error = single_open(file, sysfs_seq_show, of);
+	if (error)
+		goto err_free;
+
+	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
+	if (file->f_mode & FMODE_WRITE)
+		file->f_mode |= FMODE_PWRITE;
 
 	/* make sure we have open dirent struct */
 	error = sysfs_get_open_dirent(attr_sd, of);
 	if (error)
-		goto err_free;
+		goto err_close;
 
 	/* open succeeded, put active references */
 	sysfs_put_active(attr_sd);
 	return 0;
 
- err_free:
+err_close:
+	single_release(inode, file);
+err_free:
 	kfree(of);
- err_out:
+err_out:
 	sysfs_put_active(attr_sd);
 	return error;
 }
@@ -397,12 +381,10 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 static int sysfs_release(struct inode *inode, struct file *filp)
 {
 	struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
-	struct sysfs_open_file *of = filp->private_data;
+	struct sysfs_open_file *of = sysfs_of(filp);
 
 	sysfs_put_open_dirent(sd, of);
-
-	if (of->page)
-		free_page((unsigned long)of->page);
+	single_release(inode, filp);
 	kfree(of);
 
 	return 0;
@@ -423,7 +405,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
  */
 static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 {
-	struct sysfs_open_file *of = filp->private_data;
+	struct sysfs_open_file *of = sysfs_of(filp);
 	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
 	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
@@ -481,9 +463,9 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 EXPORT_SYMBOL_GPL(sysfs_notify);
 
 const struct file_operations sysfs_file_operations = {
-	.read		= sysfs_read_file,
+	.read		= seq_read,
 	.write		= sysfs_write_file,
-	.llseek		= generic_file_llseek,
+	.llseek		= seq_lseek,
 	.open		= sysfs_open_file,
 	.release	= sysfs_release,
 	.poll		= sysfs_poll,
-- 
cgit v1.3


From 91270162bf8a86bee756da7e931591d9953cb5a8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:03 -0400
Subject: sysfs: skip bin_buffer->buffer while reading

After b31ca3f5dfc ("sysfs: fix deadlock"), bin read() first writes
data to bb->buffer and bounces it to a transient kernel buffer which
is then copied out to userland.  The double bouncing doesn't add
anything.  Let's just use the transient buffer directly.

While at it, rename @temp to @buf for clarity.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/bin.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index d49e6ca3345b..d2142c0648eb 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -72,7 +72,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 	int size = file_inode(file)->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
-	char *temp;
+	char *buf;
 
 	if (!bytes)
 		return 0;
@@ -84,23 +84,18 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 			count = size - offs;
 	}
 
-	temp = kmalloc(count, GFP_KERNEL);
-	if (!temp)
+	buf = kmalloc(count, GFP_KERNEL);
+	if (!buf)
 		return -ENOMEM;
 
 	mutex_lock(&bb->mutex);
+	count = fill_read(file, buf, offs, count);
+	mutex_unlock(&bb->mutex);
 
-	count = fill_read(file, bb->buffer, offs, count);
-	if (count < 0) {
-		mutex_unlock(&bb->mutex);
+	if (count < 0)
 		goto out_free;
-	}
-
-	memcpy(temp, bb->buffer, count);
 
-	mutex_unlock(&bb->mutex);
-
-	if (copy_to_user(userbuf, temp, count)) {
+	if (copy_to_user(userbuf, buf, count)) {
 		count = -EFAULT;
 		goto out_free;
 	}
@@ -110,7 +105,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 	*off = offs + count;
 
  out_free:
-	kfree(temp);
+	kfree(buf);
 	return count;
 }
 
-- 
cgit v1.3


From 3ff65d3cb09ee642363bcaf12c6c38670263d93a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:04 -0400
Subject: sysfs: collapse fs/sysfs/bin.c::fill_read() into read()

read() is simple enough and fill_read() being in a separate function
doesn't add anything.  Let's collapse it into read().  This will make
merging bin file handling with regular file.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/bin.c | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index d2142c0648eb..60a4e787dc4f 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -44,30 +44,12 @@ struct bin_buffer {
 	struct hlist_node		list;
 };
 
-static int
-fill_read(struct file *file, char *buffer, loff_t off, size_t count)
+static ssize_t
+read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	int rc;
-
-	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
-
-	rc = -EIO;
-	if (attr->read)
-		rc = attr->read(file, kobj, attr, buffer, off, count);
-
-	sysfs_put_active(attr_sd);
-
-	return rc;
-}
-
-static ssize_t
-read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
-{
 	struct bin_buffer *bb = file->private_data;
 	int size = file_inode(file)->i_size;
 	loff_t offs = *off;
@@ -88,8 +70,20 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 	if (!buf)
 		return -ENOMEM;
 
+	/* need attr_sd for attr, its parent for kobj */
 	mutex_lock(&bb->mutex);
-	count = fill_read(file, buf, offs, count);
+	if (!sysfs_get_active(attr_sd)) {
+		count = -ENODEV;
+		mutex_unlock(&bb->mutex);
+		goto out_free;
+	}
+
+	if (attr->read)
+		count = attr->read(file, kobj, attr, buf, offs, count);
+	else
+		count = -EIO;
+
+	sysfs_put_active(attr_sd);
 	mutex_unlock(&bb->mutex);
 
 	if (count < 0)
-- 
cgit v1.3


From f9b9a6217cf10fd5d3002627cc13c4789a777213 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:05 -0400
Subject: sysfs: prepare path write for unified regular / bin file handling

sysfs bin file handling will be merged into the regular file support.
This patch prepares the write path.

bin file write is almost identical to regular file write except that
the write length is capped by the inode size and @off is passed to the
write method.  This patch adds bin file handling to sysfs_write_file()
so that it can handle both regular and bin files.

A new file_operations struct sysfs_bin_operations is added, which
currently only hosts sysfs_write_file() and generic_file_llseek().
This isn't used yet but will eventually replace fs/sysfs/bin.c.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c  | 40 ++++++++++++++++++++++++++++++++++------
 fs/sysfs/sysfs.h |  1 +
 2 files changed, 35 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4921bda3a37a..b36473f21824 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -54,6 +54,11 @@ struct sysfs_open_file {
 	struct list_head	list;
 };
 
+static bool sysfs_is_bin(struct sysfs_dirent *sd)
+{
+	return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
+}
+
 static struct sysfs_open_file *sysfs_of(struct file *file)
 {
 	return ((struct seq_file *)file->private_data)->private;
@@ -138,16 +143,16 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
  * flush_write_buffer - push buffer to kobject
  * @of: open file
  * @buf: data buffer for file
+ * @off: file offset to write to
  * @count: number of bytes
  *
  * Get the correct pointers for the kobject and the attribute we're dealing
  * with, then call the store() method for it with @buf.
  */
-static int flush_write_buffer(struct sysfs_open_file *of, char *buf,
+static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
 			      size_t count)
 {
 	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops *ops;
 	int rc = 0;
 
 	/*
@@ -161,8 +166,18 @@ static int flush_write_buffer(struct sysfs_open_file *of, char *buf,
 		return -ENODEV;
 	}
 
-	ops = sysfs_file_ops(of->sd);
-	rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
+	if (sysfs_is_bin(of->sd)) {
+		struct bin_attribute *battr = of->sd->s_bin_attr.bin_attr;
+
+		rc = -EIO;
+		if (battr->write)
+			rc = battr->write(of->file, kobj, battr, buf, off,
+					  count);
+	} else {
+		const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
+
+		rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
+	}
 
 	sysfs_put_active(of->sd);
 	mutex_unlock(&of->mutex);
@@ -190,9 +205,17 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
 	struct sysfs_open_file *of = sysfs_of(file);
-	ssize_t len = min_t(size_t, count, PAGE_SIZE - 1);
+	ssize_t len = min_t(size_t, count, PAGE_SIZE);
 	char *buf;
 
+	if (sysfs_is_bin(of->sd)) {
+		loff_t size = file_inode(file)->i_size;
+
+		if (size <= *ppos)
+			return 0;
+		len = min_t(ssize_t, len, size - *ppos);
+	}
+
 	if (!len)
 		return 0;
 
@@ -206,7 +229,7 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
 	}
 	buf[len] = '\0';	/* guarantee string termination */
 
-	len = flush_write_buffer(of, buf, len);
+	len = flush_write_buffer(of, buf, *ppos, len);
 	if (len > 0)
 		*ppos += len;
 out_free:
@@ -471,6 +494,11 @@ const struct file_operations sysfs_file_operations = {
 	.poll		= sysfs_poll,
 };
 
+const struct file_operations sysfs_bin_operations = {
+	.write		= sysfs_write_file,
+	.llseek		= generic_file_llseek,
+};
+
 int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
 			   const struct attribute *attr, int type,
 			   umode_t amode, const void *ns)
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 4b1d8258b071..f103bac53df7 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -212,6 +212,7 @@ int sysfs_inode_init(void);
  * file.c
  */
 extern const struct file_operations sysfs_file_operations;
+extern const struct file_operations sysfs_bin_operations;
 
 int sysfs_add_file(struct sysfs_dirent *dir_sd,
 		   const struct attribute *attr, int type);
-- 
cgit v1.3


From 2f0c6b7593a590eef7fa35344da57380fcee7581 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:06 -0400
Subject: sysfs: add sysfs_bin_read()

sysfs bin file handling will be merged into the regular file support.
This patch prepares the read path.

Copy fs/sysfs/bin.c::read() to fs/sysfs/file.c and make it use
sysfs_open_file instead of bin_buffer.  The function is identical copy
except for the use of sysfs_open_file.

The new function is added to sysfs_bin_operations.  This isn't used
yet but will eventually replace fs/sysfs/bin.c.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b36473f21824..9ba492a3d932 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -139,6 +139,70 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
 	return 0;
 }
 
+/*
+ * Read method for bin files.  As reading a bin file can have side-effects,
+ * the exact offset and bytes specified in read(2) call should be passed to
+ * the read callback making it difficult to use seq_file.  Implement
+ * simplistic custom buffering for bin files.
+ */
+static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
+			      size_t bytes, loff_t *off)
+{
+	struct sysfs_open_file *of = sysfs_of(file);
+	struct bin_attribute *battr = of->sd->s_bin_attr.bin_attr;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
+	int size = file_inode(file)->i_size;
+	int count = min_t(size_t, bytes, PAGE_SIZE);
+	loff_t offs = *off;
+	char *buf;
+
+	if (!bytes)
+		return 0;
+
+	if (size) {
+		if (offs > size)
+			return 0;
+		if (offs + count > size)
+			count = size - offs;
+	}
+
+	buf = kmalloc(count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* need of->sd for battr, its parent for kobj */
+	mutex_lock(&of->mutex);
+	if (!sysfs_get_active(of->sd)) {
+		count = -ENODEV;
+		mutex_unlock(&of->mutex);
+		goto out_free;
+	}
+
+	if (battr->read)
+		count = battr->read(file, kobj, battr, buf, offs, count);
+	else
+		count = -EIO;
+
+	sysfs_put_active(of->sd);
+	mutex_unlock(&of->mutex);
+
+	if (count < 0)
+		goto out_free;
+
+	if (copy_to_user(userbuf, buf, count)) {
+		count = -EFAULT;
+		goto out_free;
+	}
+
+	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
+
+	*off = offs + count;
+
+ out_free:
+	kfree(buf);
+	return count;
+}
+
 /**
  * flush_write_buffer - push buffer to kobject
  * @of: open file
@@ -495,6 +559,7 @@ const struct file_operations sysfs_file_operations = {
 };
 
 const struct file_operations sysfs_bin_operations = {
+	.read		= sysfs_bin_read,
 	.write		= sysfs_write_file,
 	.llseek		= generic_file_llseek,
 };
-- 
cgit v1.3


From 73d9714627adced2942e8d53ce0e73d9699a996c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:07 -0400
Subject: sysfs: copy bin mmap support from fs/sysfs/bin.c to fs/sysfs/file.c

sysfs bin file handling will be merged into the regular file support.
This patch copies mmap support from bin so that fs/sysfs/file.c can
handle mmapping bin files.

The code is copied mostly verbatim with the following updates.

* ->mmapped and ->vm_ops are added to sysfs_open_file and bin_buffer
  references are replaced with sysfs_open_file ones.

* Symbols are prefixed with sysfs_.

* sysfs_unmap_bin_file() grabs sysfs_open_dirent and traverses
  ->files.  Invocation of this function is added to
  sysfs_addrm_finish().

* sysfs_bin_mmap() is added to sysfs_bin_operations.

This is a preparation and the new mmap path isn't used yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c   |   1 +
 fs/sysfs/file.c  | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/sysfs/sysfs.h |   2 +
 3 files changed, 249 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b518afd0d11e..c4040ddb8308 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -595,6 +595,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 		acxt->removed = sd->u.removed_list;
 
 		sysfs_deactivate(sd);
+		sysfs_unmap_bin_file(sd);
 		unmap_bin_file(sd);
 		sysfs_put(sd);
 	}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 9ba492a3d932..02797a134cf8 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -22,6 +22,7 @@
 #include <linux/limits.h>
 #include <linux/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/mm.h>
 
 #include "sysfs.h"
 
@@ -52,6 +53,9 @@ struct sysfs_open_file {
 	struct mutex		mutex;
 	int			event;
 	struct list_head	list;
+
+	bool			mmapped;
+	const struct vm_operations_struct *vm_ops;
 };
 
 static bool sysfs_is_bin(struct sysfs_dirent *sd)
@@ -301,6 +305,218 @@ out_free:
 	return len;
 }
 
+static void sysfs_bin_vma_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+
+	if (!of->vm_ops)
+		return;
+
+	if (!sysfs_get_active(of->sd))
+		return;
+
+	if (of->vm_ops->open)
+		of->vm_ops->open(vma);
+
+	sysfs_put_active(of->sd);
+}
+
+static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!sysfs_get_active(of->sd))
+		return VM_FAULT_SIGBUS;
+
+	ret = VM_FAULT_SIGBUS;
+	if (of->vm_ops->fault)
+		ret = of->vm_ops->fault(vma, vmf);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma,
+				  struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!sysfs_get_active(of->sd))
+		return VM_FAULT_SIGBUS;
+
+	ret = 0;
+	if (of->vm_ops->page_mkwrite)
+		ret = of->vm_ops->page_mkwrite(vma, vmf);
+	else
+		file_update_time(file);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
+			    void *buf, int len, int write)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return -EINVAL;
+
+	if (!sysfs_get_active(of->sd))
+		return -EINVAL;
+
+	ret = -EINVAL;
+	if (of->vm_ops->access)
+		ret = of->vm_ops->access(vma, addr, buf, len, write);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int sysfs_bin_set_policy(struct vm_area_struct *vma,
+				struct mempolicy *new)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!sysfs_get_active(of->sd))
+		return -EINVAL;
+
+	ret = 0;
+	if (of->vm_ops->set_policy)
+		ret = of->vm_ops->set_policy(vma, new);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
+					      unsigned long addr)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	struct mempolicy *pol;
+
+	if (!of->vm_ops)
+		return vma->vm_policy;
+
+	if (!sysfs_get_active(of->sd))
+		return vma->vm_policy;
+
+	pol = vma->vm_policy;
+	if (of->vm_ops->get_policy)
+		pol = of->vm_ops->get_policy(vma, addr);
+
+	sysfs_put_active(of->sd);
+	return pol;
+}
+
+static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
+			     const nodemask_t *to, unsigned long flags)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!sysfs_get_active(of->sd))
+		return 0;
+
+	ret = 0;
+	if (of->vm_ops->migrate)
+		ret = of->vm_ops->migrate(vma, from, to, flags);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+#endif
+
+static const struct vm_operations_struct sysfs_bin_vm_ops = {
+	.open		= sysfs_bin_vma_open,
+	.fault		= sysfs_bin_fault,
+	.page_mkwrite	= sysfs_bin_page_mkwrite,
+	.access		= sysfs_bin_access,
+#ifdef CONFIG_NUMA
+	.set_policy	= sysfs_bin_set_policy,
+	.get_policy	= sysfs_bin_get_policy,
+	.migrate	= sysfs_bin_migrate,
+#endif
+};
+
+static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sysfs_open_file *of = sysfs_of(file);
+	struct bin_attribute *battr = of->sd->s_bin_attr.bin_attr;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
+	int rc;
+
+	mutex_lock(&of->mutex);
+
+	/* need of->sd for battr, its parent for kobj */
+	rc = -ENODEV;
+	if (!sysfs_get_active(of->sd))
+		goto out_unlock;
+
+	rc = -EINVAL;
+	if (!battr->mmap)
+		goto out_put;
+
+	rc = battr->mmap(file, kobj, battr, vma);
+	if (rc)
+		goto out_put;
+
+	/*
+	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+	 * to satisfy versions of X which crash if the mmap fails: that
+	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
+	 */
+	if (vma->vm_file != file)
+		goto out_put;
+
+	rc = -EINVAL;
+	if (of->mmapped && of->vm_ops != vma->vm_ops)
+		goto out_put;
+
+	/*
+	 * It is not possible to successfully wrap close.
+	 * So error if someone is trying to use close.
+	 */
+	rc = -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->close)
+		goto out_put;
+
+	rc = 0;
+	of->mmapped = 1;
+	of->vm_ops = vma->vm_ops;
+	vma->vm_ops = &sysfs_bin_vm_ops;
+out_put:
+	sysfs_put_active(of->sd);
+out_unlock:
+	mutex_unlock(&of->mutex);
+
+	return rc;
+}
+
 /**
  *	sysfs_get_open_dirent - get or create sysfs_open_dirent
  *	@sd: target sysfs_dirent
@@ -375,7 +591,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
 	mutex_lock(&sysfs_open_file_mutex);
 	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
-	list_del(&of->list);
+	if (of)
+		list_del(&of->list);
+
 	if (atomic_dec_and_test(&od->refcnt))
 		sd->s_attr.open = NULL;
 	else
@@ -477,6 +695,32 @@ static int sysfs_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
+{
+	struct sysfs_open_dirent *od;
+	struct sysfs_open_file *of;
+
+	if (!sysfs_is_bin(sd))
+		return;
+
+	spin_lock_irq(&sysfs_open_dirent_lock);
+	od = sd->s_attr.open;
+	if (od)
+		atomic_inc(&od->refcnt);
+	spin_unlock_irq(&sysfs_open_dirent_lock);
+	if (!od)
+		return;
+
+	mutex_lock(&sysfs_open_file_mutex);
+	list_for_each_entry(of, &od->files, list) {
+		struct inode *inode = file_inode(of->file);
+		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+	}
+	mutex_unlock(&sysfs_open_file_mutex);
+
+	sysfs_put_open_dirent(sd, NULL);
+}
+
 /* Sysfs attribute files are pollable.  The idea is that you read
  * the content and then you use 'poll' or 'select' to wait for
  * the content to change.  When the content changes (assuming the
@@ -562,6 +806,7 @@ const struct file_operations sysfs_bin_operations = {
 	.read		= sysfs_bin_read,
 	.write		= sysfs_write_file,
 	.llseek		= generic_file_llseek,
+	.mmap		= sysfs_bin_mmap,
 };
 
 int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index f103bac53df7..c960d6272cf6 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -220,6 +220,8 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd,
 int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
 			   const struct attribute *attr, int type,
 			   umode_t amode, const void *ns);
+void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
+
 /*
  * bin.c
  */
-- 
cgit v1.3


From 49fe604781cbb03eb6ff12a7bc4ad8eef8b830c4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:08 -0400
Subject: sysfs: prepare open path for unified regular / bin file handling

sysfs bin file handling will be merged into the regular file support.
This patch prepares the open path.

This patch updates sysfs_open_file() such that it can handle both
regular and bin files.

This is a preparation and the new bin file path isn't used yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 58 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 02797a134cf8..417d005955d9 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -610,38 +610,40 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	struct sysfs_open_file *of;
-	const struct sysfs_ops *ops;
+	bool has_read, has_write;
 	int error = -EACCES;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
-	/* every kobject with an attribute needs a ktype assigned */
-	ops = sysfs_file_ops(attr_sd);
-	if (WARN(!ops, KERN_ERR
-		 "missing sysfs attribute operations for kobject: %s\n",
-		 kobject_name(kobj)))
-		goto err_out;
+	if (sysfs_is_bin(attr_sd)) {
+		struct bin_attribute *battr = attr_sd->s_bin_attr.bin_attr;
 
-	/* File needs write support.
-	 * The inode's perms must say it's ok,
-	 * and we must have a store method.
-	 */
-	if (file->f_mode & FMODE_WRITE) {
-		if (!(inode->i_mode & S_IWUGO) || !ops->store)
-			goto err_out;
-	}
+		has_read = battr->read || battr->mmap;
+		has_write = battr->write || battr->mmap;
+	} else {
+		const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
 
-	/* File needs read support.
-	 * The inode's perms must say it's ok, and we there
-	 * must be a show method for it.
-	 */
-	if (file->f_mode & FMODE_READ) {
-		if (!(inode->i_mode & S_IRUGO) || !ops->show)
+		/* every kobject with an attribute needs a ktype assigned */
+		if (WARN(!ops, KERN_ERR
+			 "missing sysfs attribute operations for kobject: %s\n",
+			 kobject_name(kobj)))
 			goto err_out;
+
+		has_read = ops->show;
+		has_write = ops->store;
 	}
 
+	/* check perms and supported operations */
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (!(inode->i_mode & S_IWUGO) || !has_write))
+		goto err_out;
+
+	if ((file->f_mode & FMODE_READ) &&
+	    (!(inode->i_mode & S_IRUGO) || !has_read))
+		goto err_out;
+
 	/* allocate a sysfs_open_file for the file */
 	error = -ENOMEM;
 	of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
@@ -653,11 +655,14 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	of->file = file;
 
 	/*
-	 * Always instantiate seq_file even if read access is not
-	 * implemented or requested.  This unifies private data access and
-	 * most files are readable anyway.
+	 * Always instantiate seq_file even if read access doesn't use
+	 * seq_file or is not requested.  This unifies private data access
+	 * and readable regular files are the vast majority anyway.
 	 */
-	error = single_open(file, sysfs_seq_show, of);
+	if (sysfs_is_bin(attr_sd))
+		error = single_open(file, NULL, of);
+	else
+		error = single_open(file, sysfs_seq_show, of);
 	if (error)
 		goto err_free;
 
@@ -807,6 +812,9 @@ const struct file_operations sysfs_bin_operations = {
 	.write		= sysfs_write_file,
 	.llseek		= generic_file_llseek,
 	.mmap		= sysfs_bin_mmap,
+	.open		= sysfs_open_file,
+	.release	= sysfs_release,
+	.poll		= sysfs_poll,
 };
 
 int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-- 
cgit v1.3


From 3124eb1679b28726eacbc8973a891235dca3ed99 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Oct 2013 17:42:09 -0400
Subject: sysfs: merge regular and bin file handling

With the previous changes, sysfs regular file code is ready to handle
bin files too.  This patch makes bin files share the regular file
path.

* sysfs_create/remove_bin_file() are moved to fs/sysfs/file.c.

* sysfs_init_inode() is updated to use the new sysfs_bin_operations
  instead of bin_fops for bin files.

* fs/sysfs/bin.c and the related pieces are removed.

This patch shouldn't introduce any behavior difference to bin file
accesses.

Overall, this unification reduces the amount of duplicate logic, makes
behaviors more consistent and paves the road for building simpler and
more versatile interface which will allow other subsystems to make use
of sysfs for their pseudo filesystems.

v2: Stale fs/sysfs/bin.c reference dropped from
    Documentation/DocBook/filesystems.tmpl.  Reported by kbuild test
    robot.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay@vrfy.org>
Cc: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/DocBook/filesystems.tmpl |   1 -
 fs/sysfs/Makefile                      |   3 +-
 fs/sysfs/bin.c                         | 491 ---------------------------------
 fs/sysfs/dir.c                         |   1 -
 fs/sysfs/file.c                        |  26 ++
 fs/sysfs/inode.c                       |   2 +-
 fs/sysfs/sysfs.h                       |   6 -
 7 files changed, 28 insertions(+), 502 deletions(-)
 delete mode 100644 fs/sysfs/bin.c

(limited to 'fs')

diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 25b58efd955d..4f676838da06 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -91,7 +91,6 @@
      <title>The Filesystem for Exporting Kernel Objects</title>
 !Efs/sysfs/file.c
 !Efs/sysfs/symlink.c
-!Efs/sysfs/bin.c
   </chapter>
 
   <chapter id="debugfs">
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 7a1ceb946b80..8876ac183373 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,5 +2,4 @@
 # Makefile for the sysfs virtual filesystem
 #
 
-obj-y		:= inode.o file.o dir.o symlink.o mount.o bin.o \
-		   group.o
+obj-y		:= inode.o file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
deleted file mode 100644
index 60a4e787dc4f..000000000000
--- a/fs/sysfs/bin.c
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * fs/sysfs/bin.c - sysfs binary file implementation
- *
- * Copyright (c) 2003 Patrick Mochel
- * Copyright (c) 2003 Matthew Wilcox
- * Copyright (c) 2004 Silicon Graphics, Inc.
- * Copyright (c) 2007 SUSE Linux Products GmbH
- * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
- *
- * This file is released under the GPLv2.
- *
- * Please see Documentation/filesystems/sysfs.txt for more information.
- */
-
-#undef DEBUG
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/kobject.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/mm.h>
-#include <linux/uaccess.h>
-
-#include "sysfs.h"
-
-/*
- * There's one bin_buffer for each open file.
- *
- * filp->private_data points to bin_buffer and
- * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s
- * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock
- */
-static DEFINE_MUTEX(sysfs_bin_lock);
-
-struct bin_buffer {
-	struct mutex			mutex;
-	void				*buffer;
-	int				mmapped;
-	const struct vm_operations_struct *vm_ops;
-	struct file			*file;
-	struct hlist_node		list;
-};
-
-static ssize_t
-read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
-{
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct bin_buffer *bb = file->private_data;
-	int size = file_inode(file)->i_size;
-	loff_t offs = *off;
-	int count = min_t(size_t, bytes, PAGE_SIZE);
-	char *buf;
-
-	if (!bytes)
-		return 0;
-
-	if (size) {
-		if (offs > size)
-			return 0;
-		if (offs + count > size)
-			count = size - offs;
-	}
-
-	buf = kmalloc(count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	/* need attr_sd for attr, its parent for kobj */
-	mutex_lock(&bb->mutex);
-	if (!sysfs_get_active(attr_sd)) {
-		count = -ENODEV;
-		mutex_unlock(&bb->mutex);
-		goto out_free;
-	}
-
-	if (attr->read)
-		count = attr->read(file, kobj, attr, buf, offs, count);
-	else
-		count = -EIO;
-
-	sysfs_put_active(attr_sd);
-	mutex_unlock(&bb->mutex);
-
-	if (count < 0)
-		goto out_free;
-
-	if (copy_to_user(userbuf, buf, count)) {
-		count = -EFAULT;
-		goto out_free;
-	}
-
-	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
-
-	*off = offs + count;
-
- out_free:
-	kfree(buf);
-	return count;
-}
-
-static int
-flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
-{
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	int rc;
-
-	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
-
-	rc = -EIO;
-	if (attr->write)
-		rc = attr->write(file, kobj, attr, buffer, offset, count);
-
-	sysfs_put_active(attr_sd);
-
-	return rc;
-}
-
-static ssize_t write(struct file *file, const char __user *userbuf,
-		     size_t bytes, loff_t *off)
-{
-	struct bin_buffer *bb = file->private_data;
-	int size = file_inode(file)->i_size;
-	loff_t offs = *off;
-	int count = min_t(size_t, bytes, PAGE_SIZE);
-	char *temp;
-
-	if (!bytes)
-		return 0;
-
-	if (size) {
-		if (offs > size)
-			return 0;
-		if (offs + count > size)
-			count = size - offs;
-	}
-
-	temp = memdup_user(userbuf, count);
-	if (IS_ERR(temp))
-		return PTR_ERR(temp);
-
-	mutex_lock(&bb->mutex);
-
-	memcpy(bb->buffer, temp, count);
-
-	count = flush_write(file, bb->buffer, offs, count);
-	mutex_unlock(&bb->mutex);
-
-	if (count > 0)
-		*off = offs + count;
-
-	kfree(temp);
-	return count;
-}
-
-static void bin_vma_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-
-	if (!bb->vm_ops)
-		return;
-
-	if (!sysfs_get_active(attr_sd))
-		return;
-
-	if (bb->vm_ops->open)
-		bb->vm_ops->open(vma);
-
-	sysfs_put_active(attr_sd);
-}
-
-static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return VM_FAULT_SIGBUS;
-
-	if (!sysfs_get_active(attr_sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = VM_FAULT_SIGBUS;
-	if (bb->vm_ops->fault)
-		ret = bb->vm_ops->fault(vma, vmf);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return VM_FAULT_SIGBUS;
-
-	if (!sysfs_get_active(attr_sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = 0;
-	if (bb->vm_ops->page_mkwrite)
-		ret = bb->vm_ops->page_mkwrite(vma, vmf);
-	else
-		file_update_time(file);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-static int bin_access(struct vm_area_struct *vma, unsigned long addr,
-		  void *buf, int len, int write)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return -EINVAL;
-
-	if (!sysfs_get_active(attr_sd))
-		return -EINVAL;
-
-	ret = -EINVAL;
-	if (bb->vm_ops->access)
-		ret = bb->vm_ops->access(vma, addr, buf, len, write);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-#ifdef CONFIG_NUMA
-static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(attr_sd))
-		return -EINVAL;
-
-	ret = 0;
-	if (bb->vm_ops->set_policy)
-		ret = bb->vm_ops->set_policy(vma, new);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
-					unsigned long addr)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct mempolicy *pol;
-
-	if (!bb->vm_ops)
-		return vma->vm_policy;
-
-	if (!sysfs_get_active(attr_sd))
-		return vma->vm_policy;
-
-	pol = vma->vm_policy;
-	if (bb->vm_ops->get_policy)
-		pol = bb->vm_ops->get_policy(vma, addr);
-
-	sysfs_put_active(attr_sd);
-	return pol;
-}
-
-static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
-			const nodemask_t *to, unsigned long flags)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(attr_sd))
-		return 0;
-
-	ret = 0;
-	if (bb->vm_ops->migrate)
-		ret = bb->vm_ops->migrate(vma, from, to, flags);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-#endif
-
-static const struct vm_operations_struct bin_vm_ops = {
-	.open		= bin_vma_open,
-	.fault		= bin_fault,
-	.page_mkwrite	= bin_page_mkwrite,
-	.access		= bin_access,
-#ifdef CONFIG_NUMA
-	.set_policy	= bin_set_policy,
-	.get_policy	= bin_get_policy,
-	.migrate	= bin_migrate,
-#endif
-};
-
-static int mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	int rc;
-
-	mutex_lock(&bb->mutex);
-
-	/* need attr_sd for attr, its parent for kobj */
-	rc = -ENODEV;
-	if (!sysfs_get_active(attr_sd))
-		goto out_unlock;
-
-	rc = -EINVAL;
-	if (!attr->mmap)
-		goto out_put;
-
-	rc = attr->mmap(file, kobj, attr, vma);
-	if (rc)
-		goto out_put;
-
-	/*
-	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
-	 * to satisfy versions of X which crash if the mmap fails: that
-	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
-	 */
-	if (vma->vm_file != file)
-		goto out_put;
-
-	rc = -EINVAL;
-	if (bb->mmapped && bb->vm_ops != vma->vm_ops)
-		goto out_put;
-
-	/*
-	 * It is not possible to successfully wrap close.
-	 * So error if someone is trying to use close.
-	 */
-	rc = -EINVAL;
-	if (vma->vm_ops && vma->vm_ops->close)
-		goto out_put;
-
-	rc = 0;
-	bb->mmapped = 1;
-	bb->vm_ops = vma->vm_ops;
-	vma->vm_ops = &bin_vm_ops;
-out_put:
-	sysfs_put_active(attr_sd);
-out_unlock:
-	mutex_unlock(&bb->mutex);
-
-	return rc;
-}
-
-static int open(struct inode *inode, struct file *file)
-{
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct bin_buffer *bb = NULL;
-	int error;
-
-	/* binary file operations requires both @sd and its parent */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
-
-	error = -EACCES;
-	if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap))
-		goto err_out;
-	if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap))
-		goto err_out;
-
-	error = -ENOMEM;
-	bb = kzalloc(sizeof(*bb), GFP_KERNEL);
-	if (!bb)
-		goto err_out;
-
-	bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!bb->buffer)
-		goto err_out;
-
-	mutex_init(&bb->mutex);
-	bb->file = file;
-	file->private_data = bb;
-
-	mutex_lock(&sysfs_bin_lock);
-	hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers);
-	mutex_unlock(&sysfs_bin_lock);
-
-	/* open succeeded, put active references */
-	sysfs_put_active(attr_sd);
-	return 0;
-
- err_out:
-	sysfs_put_active(attr_sd);
-	kfree(bb);
-	return error;
-}
-
-static int release(struct inode *inode, struct file *file)
-{
-	struct bin_buffer *bb = file->private_data;
-
-	mutex_lock(&sysfs_bin_lock);
-	hlist_del(&bb->list);
-	mutex_unlock(&sysfs_bin_lock);
-
-	kfree(bb->buffer);
-	kfree(bb);
-	return 0;
-}
-
-const struct file_operations bin_fops = {
-	.read		= read,
-	.write		= write,
-	.mmap		= mmap,
-	.llseek		= generic_file_llseek,
-	.open		= open,
-	.release	= release,
-};
-
-
-void unmap_bin_file(struct sysfs_dirent *attr_sd)
-{
-	struct bin_buffer *bb;
-
-	if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
-		return;
-
-	mutex_lock(&sysfs_bin_lock);
-
-	hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) {
-		struct inode *inode = file_inode(bb->file);
-
-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
-	}
-
-	mutex_unlock(&sysfs_bin_lock);
-}
-
-/**
- *	sysfs_create_bin_file - create binary file for object.
- *	@kobj:	object.
- *	@attr:	attribute descriptor.
- */
-int sysfs_create_bin_file(struct kobject *kobj,
-			  const struct bin_attribute *attr)
-{
-	BUG_ON(!kobj || !kobj->sd || !attr);
-
-	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
-}
-EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
-
-/**
- *	sysfs_remove_bin_file - remove binary file for object.
- *	@kobj:	object.
- *	@attr:	attribute descriptor.
- */
-void sysfs_remove_bin_file(struct kobject *kobj,
-			   const struct bin_attribute *attr)
-{
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL);
-}
-EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index c4040ddb8308..f6025c81bfd5 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -596,7 +596,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 
 		sysfs_deactivate(sd);
 		sysfs_unmap_bin_file(sd);
-		unmap_bin_file(sd);
 		sysfs_put(sd);
 	}
 }
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 417d005955d9..5f7a955550de 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -987,6 +987,32 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
 
+/**
+ *	sysfs_create_bin_file - create binary file for object.
+ *	@kobj:	object.
+ *	@attr:	attribute descriptor.
+ */
+int sysfs_create_bin_file(struct kobject *kobj,
+			  const struct bin_attribute *attr)
+{
+	BUG_ON(!kobj || !kobj->sd || !attr);
+
+	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
+}
+EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
+
+/**
+ *	sysfs_remove_bin_file - remove binary file for object.
+ *	@kobj:	object.
+ *	@attr:	attribute descriptor.
+ */
+void sysfs_remove_bin_file(struct kobject *kobj,
+			   const struct bin_attribute *attr)
+{
+	sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
+
 struct sysfs_schedule_callback_struct {
 	struct list_head	workq_list;
 	struct kobject		*kobj;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 63f755ef71dd..2cb1b6b8ccbc 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -260,7 +260,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 	case SYSFS_KOBJ_BIN_ATTR:
 		bin_attr = sd->s_bin_attr.bin_attr;
 		inode->i_size = bin_attr->size;
-		inode->i_fop = &bin_fops;
+		inode->i_fop = &sysfs_bin_operations;
 		break;
 	case SYSFS_KOBJ_LINK:
 		inode->i_op = &sysfs_symlink_inode_operations;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index c960d6272cf6..4e01d3b3909c 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -222,12 +222,6 @@ int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
 			   umode_t amode, const void *ns);
 void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
 
-/*
- * bin.c
- */
-extern const struct file_operations bin_fops;
-void unmap_bin_file(struct sysfs_dirent *attr_sd);
-
 /*
  * symlink.c
  */
-- 
cgit v1.3


From e479556bfdd136669854292eb57ed0139d7253d5 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Fri, 27 Sep 2013 18:08:30 +0800
Subject: f2fs: use rw_sem instead of fs_lock(locks mutex)

The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.

V2:
  -fix the potential starvation problem.
  -use more suitable func name suggested by Xu Jin.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c |  7 +++---
 fs/f2fs/data.c       | 11 ++++-----
 fs/f2fs/f2fs.h       | 64 ++++++++++++++++++----------------------------------
 fs/f2fs/file.c       | 37 ++++++++++++++----------------
 fs/f2fs/inode.c      | 11 ++++-----
 fs/f2fs/namei.c      | 50 ++++++++++++++++++++--------------------
 fs/f2fs/recovery.c   |  7 +++---
 fs/f2fs/super.c      |  5 ++--
 fs/f2fs/xattr.c      |  7 ++----
 9 files changed, 83 insertions(+), 116 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ca3944240db5..d80882763ffd 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -619,11 +619,10 @@ static void block_operations(struct f2fs_sb_info *sbi)
 	blk_start_plug(&plug);
 
 retry_flush_dents:
-	mutex_lock_all(sbi);
-
+	f2fs_lock_all(sbi);
 	/* write all the dirty dentry pages */
 	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
-		mutex_unlock_all(sbi);
+		f2fs_unlock_all(sbi);
 		sync_dirty_dir_inodes(sbi);
 		goto retry_flush_dents;
 	}
@@ -646,7 +645,7 @@ retry_flush_nodes:
 static void unblock_operations(struct f2fs_sb_info *sbi)
 {
 	mutex_unlock(&sbi->node_write);
-	mutex_unlock_all(sbi);
+	f2fs_unlock_all(sbi);
 }
 
 static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 941f9b9ca3a5..2535d3b1a763 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -560,9 +560,9 @@ write:
 		inode_dec_dirty_dents(inode);
 		err = do_write_data_page(page);
 	} else {
-		int ilock = mutex_lock_op(sbi);
+		f2fs_lock_op(sbi);
 		err = do_write_data_page(page);
-		mutex_unlock_op(sbi, ilock);
+		f2fs_unlock_op(sbi);
 		need_balance_fs = true;
 	}
 	if (err == -ENOENT)
@@ -641,7 +641,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
 	struct dnode_of_data dn;
 	int err = 0;
-	int ilock;
 
 	f2fs_balance_fs(sbi);
 repeat:
@@ -650,7 +649,7 @@ repeat:
 		return -ENOMEM;
 	*pagep = page;
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, index, ALLOC_NODE);
@@ -664,7 +663,7 @@ repeat:
 	if (err)
 		goto err;
 
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
 		return 0;
@@ -700,7 +699,7 @@ out:
 	return 0;
 
 err:
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 	f2fs_put_page(page, 1);
 	return err;
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7fd99d8bd2ff..a955a59dfdbe 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -18,6 +18,8 @@
 #include <linux/crc32.h>
 #include <linux/magic.h>
 #include <linux/kobject.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
 
 /*
  * For mount options
@@ -317,14 +319,6 @@ enum count_type {
 	NR_COUNT_TYPE,
 };
 
-/*
- * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS].
- * The checkpoint procedure blocks all the locks in this fs_lock array.
- * Some FS operations grab free locks, and if there is no free lock,
- * then wait to grab a lock in a round-robin manner.
- */
-#define NR_GLOBAL_LOCKS	8
-
 /*
  * The below are the page types of bios used in submti_bio().
  * The available types are:
@@ -365,10 +359,10 @@ struct f2fs_sb_info {
 	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
 	struct inode *meta_inode;		/* cache meta blocks */
 	struct mutex cp_mutex;			/* checkpoint procedure lock */
-	struct mutex fs_lock[NR_GLOBAL_LOCKS];	/* blocking FS operations */
+	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
+	wait_queue_head_t cp_wait;		/* checkpoint wait queue */
 	struct mutex node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
-	unsigned char next_lock_num;		/* round-robin global locks */
 	int por_doing;				/* recovery is doing or not */
 	int on_build_free_nids;			/* build_free_nids is doing */
 
@@ -520,48 +514,34 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
 	cp->ckpt_flags = cpu_to_le32(ckpt_flags);
 }
 
-static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
+static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
 {
-	int i;
-
-	for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
-		/*
-		 * This is the only time we take multiple fs_lock[]
-		 * instances; the order is immaterial since we
-		 * always hold cp_mutex, which serializes multiple
-		 * such operations.
-		 */
-		mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
-	}
+	/*
+	 * If the checkpoint thread is waiting for cp_rwsem, add cuurent task
+	 * into wait list to avoid the checkpoint thread starvation
+	 */
+	while (!list_empty(&sbi->cp_rwsem.wait_list))
+		wait_event_interruptible(sbi->cp_wait,
+				list_empty(&sbi->cp_rwsem.wait_list));
+	down_read(&sbi->cp_rwsem);
 }
 
-static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
+static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
 {
-	int i = 0;
-	for (; i < NR_GLOBAL_LOCKS; i++)
-		mutex_unlock(&sbi->fs_lock[i]);
+	up_read(&sbi->cp_rwsem);
 }
 
-static inline int mutex_lock_op(struct f2fs_sb_info *sbi)
+static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
 {
-	unsigned char next_lock;
-	int i = 0;
-
-	for (; i < NR_GLOBAL_LOCKS; i++)
-		if (mutex_trylock(&sbi->fs_lock[i]))
-			return i;
-
-	next_lock = sbi->next_lock_num++ % NR_GLOBAL_LOCKS;
-	mutex_lock(&sbi->fs_lock[next_lock]);
-	return next_lock;
+	down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex);
 }
 
-static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock)
+static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 {
-	if (ilock < 0)
-		return;
-	BUG_ON(ilock >= NR_GLOBAL_LOCKS);
-	mutex_unlock(&sbi->fs_lock[ilock]);
+	up_write(&sbi->cp_rwsem);
+
+	/* wake up all tasks blocked by checkpoint */
+	wake_up_all(&sbi->cp_wait);
 }
 
 /*
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 02c906971cc6..c80faa2fff39 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -35,18 +35,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	block_t old_blk_addr;
 	struct dnode_of_data dn;
-	int err, ilock;
+	int err;
 
 	f2fs_balance_fs(sbi);
 
 	sb_start_pagefault(inode->i_sb);
 
 	/* block allocation */
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, page->index, ALLOC_NODE);
 	if (err) {
-		mutex_unlock_op(sbi, ilock);
+		f2fs_unlock_op(sbi);
 		goto out;
 	}
 
@@ -56,12 +56,12 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 		err = reserve_new_block(&dn);
 		if (err) {
 			f2fs_put_dnode(&dn);
-			mutex_unlock_op(sbi, ilock);
+			f2fs_unlock_op(sbi);
 			goto out;
 		}
 	}
 	f2fs_put_dnode(&dn);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	file_update_time(vma->vm_file);
 	lock_page(page);
@@ -270,7 +270,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
 	unsigned int blocksize = inode->i_sb->s_blocksize;
 	struct dnode_of_data dn;
 	pgoff_t free_from;
-	int count = 0, ilock = -1;
+	int count = 0;
 	int err;
 
 	trace_f2fs_truncate_blocks_enter(inode, from);
@@ -278,13 +278,13 @@ static int truncate_blocks(struct inode *inode, u64 from)
 	free_from = (pgoff_t)
 			((from + blocksize - 1) >> (sbi->log_blocksize));
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
 	if (err) {
 		if (err == -ENOENT)
 			goto free_next;
-		mutex_unlock_op(sbi, ilock);
+		f2fs_unlock_op(sbi);
 		trace_f2fs_truncate_blocks_exit(inode, err);
 		return err;
 	}
@@ -305,7 +305,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
 	f2fs_put_dnode(&dn);
 free_next:
 	err = truncate_inode_blocks(inode, free_from);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	/* lastly zero out the first data page */
 	truncate_partial_data_page(inode, from);
@@ -416,16 +416,15 @@ static void fill_zero(struct inode *inode, pgoff_t index,
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *page;
-	int ilock;
 
 	if (!len)
 		return;
 
 	f2fs_balance_fs(sbi);
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	page = get_new_data_page(inode, NULL, index, false);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	if (!IS_ERR(page)) {
 		wait_on_page_writeback(page);
@@ -484,7 +483,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
 			struct address_space *mapping = inode->i_mapping;
 			loff_t blk_start, blk_end;
 			struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-			int ilock;
 
 			f2fs_balance_fs(sbi);
 
@@ -493,9 +491,9 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
 			truncate_inode_pages_range(mapping, blk_start,
 					blk_end - 1);
 
-			ilock = mutex_lock_op(sbi);
+			f2fs_lock_op(sbi);
 			ret = truncate_hole(inode, pg_start, pg_end);
-			mutex_unlock_op(sbi, ilock);
+			f2fs_unlock_op(sbi);
 		}
 	}
 
@@ -529,13 +527,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 
 	for (index = pg_start; index <= pg_end; index++) {
 		struct dnode_of_data dn;
-		int ilock;
 
-		ilock = mutex_lock_op(sbi);
+		f2fs_lock_op(sbi);
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
 		ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
 		if (ret) {
-			mutex_unlock_op(sbi, ilock);
+			f2fs_unlock_op(sbi);
 			break;
 		}
 
@@ -543,12 +540,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 			ret = reserve_new_block(&dn);
 			if (ret) {
 				f2fs_put_dnode(&dn);
-				mutex_unlock_op(sbi, ilock);
+				f2fs_unlock_op(sbi);
 				break;
 			}
 		}
 		f2fs_put_dnode(&dn);
-		mutex_unlock_op(sbi, ilock);
+		f2fs_unlock_op(sbi);
 
 		if (pg_start == pg_end)
 			new_size = offset + len;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9339cd292047..c8c058024af8 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -214,7 +214,7 @@ int update_inode_page(struct inode *inode)
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	int ret, ilock;
+	int ret;
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
@@ -227,9 +227,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	 * We need to lock here to prevent from producing dirty node pages
 	 * during the urgent cleaning time when runing out of free sections.
 	 */
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	ret = update_inode_page(inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	if (wbc)
 		f2fs_balance_fs(sbi);
@@ -243,7 +243,6 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 void f2fs_evict_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	int ilock;
 
 	trace_f2fs_evict_inode(inode);
 	truncate_inode_pages(&inode->i_data, 0);
@@ -265,9 +264,9 @@ void f2fs_evict_inode(struct inode *inode)
 	if (F2FS_HAS_BLOCKS(inode))
 		f2fs_truncate(inode);
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	remove_inode_page(inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	sb_end_intwrite(inode->i_sb);
 no_delete:
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 2a5359c990fc..29f73fdf958e 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -27,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	nid_t ino;
 	struct inode *inode;
 	bool nid_free = false;
-	int err, ilock;
+	int err;
 
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	if (!alloc_nid(sbi, &ino)) {
-		mutex_unlock_op(sbi, ilock);
+		f2fs_unlock_op(sbi);
 		err = -ENOSPC;
 		goto fail;
 	}
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	inode->i_uid = current_fsuid();
 
@@ -115,7 +115,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
 	nid_t ino = 0;
-	int err, ilock;
+	int err;
 
 	f2fs_balance_fs(sbi);
 
@@ -131,9 +131,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 	ino = inode->i_ino;
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
 
@@ -157,7 +157,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 	struct inode *inode = old_dentry->d_inode;
 	struct super_block *sb = dir->i_sb;
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
-	int err, ilock;
+	int err;
 
 	f2fs_balance_fs(sbi);
 
@@ -165,9 +165,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 	ihold(inode);
 
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
 
@@ -220,7 +220,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 	struct f2fs_dir_entry *de;
 	struct page *page;
 	int err = -ENOENT;
-	int ilock;
 
 	trace_f2fs_unlink_enter(dir, dentry);
 	f2fs_balance_fs(sbi);
@@ -236,9 +235,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 		goto fail;
 	}
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	f2fs_delete_entry(de, page, inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	/* In order to evict this inode,  we set it dirty */
 	mark_inode_dirty(inode);
@@ -254,7 +253,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
 	size_t symlen = strlen(symname) + 1;
-	int err, ilock;
+	int err;
 
 	f2fs_balance_fs(sbi);
 
@@ -265,9 +264,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &f2fs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
 
@@ -290,7 +289,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	struct inode *inode;
-	int err, ilock;
+	int err;
 
 	f2fs_balance_fs(sbi);
 
@@ -304,9 +303,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
 
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 	if (err)
 		goto out_fail;
 
@@ -342,7 +341,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
 	int err = 0;
-	int ilock;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -356,9 +354,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	init_special_inode(inode, inode->i_mode, rdev);
 	inode->i_op = &f2fs_special_inode_operations;
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 	if (err)
 		goto out;
 
@@ -387,7 +385,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct f2fs_dir_entry *old_dir_entry = NULL;
 	struct f2fs_dir_entry *old_entry;
 	struct f2fs_dir_entry *new_entry;
-	int err = -ENOENT, ilock = -1;
+	int err = -ENOENT;
 
 	f2fs_balance_fs(sbi);
 
@@ -402,7 +400,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out_old;
 	}
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 
 	if (new_inode) {
 
@@ -467,7 +465,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		update_inode_page(old_dir);
 	}
 
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 	return 0;
 
 put_out_dir:
@@ -477,7 +475,7 @@ out_dir:
 		kunmap(old_dir_page);
 		f2fs_put_page(old_dir_page, 0);
 	}
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 out_old:
 	kunmap(old_page);
 	f2fs_put_page(old_page, 0);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index a15d122fdc50..353cf4f66c7b 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -292,7 +292,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	struct f2fs_summary sum;
 	struct node_info ni;
 	int err = 0, recovered = 0;
-	int ilock;
 
 	start = start_bidx_of_node(ofs_of_node(page), fi);
 	if (IS_INODE(page))
@@ -300,12 +299,12 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	else
 		end = start + ADDRS_PER_BLOCK;
 
-	ilock = mutex_lock_op(sbi);
+	f2fs_lock_op(sbi);
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 
 	err = get_dnode_of_data(&dn, start, ALLOC_NODE);
 	if (err) {
-		mutex_unlock_op(sbi, ilock);
+		f2fs_unlock_op(sbi);
 		return err;
 	}
 
@@ -356,7 +355,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
 err:
 	f2fs_put_dnode(&dn);
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
 			"recovered_data = %d blocks, err = %d",
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 13d0a0fe49dd..347d70049288 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -760,7 +760,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	struct buffer_head *raw_super_buf;
 	struct inode *root;
 	long err = -EINVAL;
-	int i;
 
 	/* allocate memory for f2fs-specific super block info */
 	sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
@@ -818,12 +817,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	mutex_init(&sbi->gc_mutex);
 	mutex_init(&sbi->writepages);
 	mutex_init(&sbi->cp_mutex);
-	for (i = 0; i < NR_GLOBAL_LOCKS; i++)
-		mutex_init(&sbi->fs_lock[i]);
 	mutex_init(&sbi->node_write);
 	sbi->por_doing = 0;
 	spin_lock_init(&sbi->stat_lock);
 	init_rwsem(&sbi->bio_sem);
+	init_rwsem(&sbi->cp_rwsem);
+	init_waitqueue_head(&sbi->cp_wait);
 	init_sb_info(sbi);
 
 	/* get an inode for meta space */
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3d900ea34e47..f685138dd496 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -584,16 +584,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
 			const void *value, size_t value_len, struct page *ipage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	int ilock;
 	int err;
 
 	f2fs_balance_fs(sbi);
 
-	ilock = mutex_lock_op(sbi);
-
+	f2fs_lock_op(sbi);
 	err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage);
-
-	mutex_unlock_op(sbi, ilock);
+	f2fs_unlock_op(sbi);
 
 	return err;
 }
-- 
cgit v1.3


From 4058c5117d6d8a94002fb308615c7442ed3a3896 Mon Sep 17 00:00:00 2001
From: Kelly Anderson <kelly@xilka.com>
Date: Mon, 7 Oct 2013 11:36:20 +0900
Subject: f2fs: handle remount options correctly

The current f2fs code errors if the xattr or acl options are passed when
remounting.  This is important in a typical scenario where f2fs is mounted
as a "ro" root file-system by the boot loader and then the init process wants
to remount it "rw" with the "remount,rw" option.

Signed-off-by: Kelly Anderson <kelly@xilka.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/super.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 347d70049288..fde8e6aca9be 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -43,7 +43,9 @@ enum {
 	Opt_disable_roll_forward,
 	Opt_discard,
 	Opt_noheap,
+	Opt_user_xattr,
 	Opt_nouser_xattr,
+	Opt_acl,
 	Opt_noacl,
 	Opt_active_logs,
 	Opt_disable_ext_identify,
@@ -56,7 +58,9 @@ static match_table_t f2fs_tokens = {
 	{Opt_disable_roll_forward, "disable_roll_forward"},
 	{Opt_discard, "discard"},
 	{Opt_noheap, "no_heap"},
+	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
 	{Opt_active_logs, "active_logs=%u"},
 	{Opt_disable_ext_identify, "disable_ext_identify"},
@@ -237,6 +241,9 @@ static int parse_options(struct super_block *sb, char *options)
 			set_opt(sbi, NOHEAP);
 			break;
 #ifdef CONFIG_F2FS_FS_XATTR
+		case Opt_user_xattr:
+			set_opt(sbi, XATTR_USER);
+			break;
 		case Opt_nouser_xattr:
 			clear_opt(sbi, XATTR_USER);
 			break;
@@ -244,6 +251,10 @@ static int parse_options(struct super_block *sb, char *options)
 			set_opt(sbi, INLINE_XATTR);
 			break;
 #else
+		case Opt_user_xattr:
+			f2fs_msg(sb, KERN_INFO,
+				"user_xattr options not supported");
+			break;
 		case Opt_nouser_xattr:
 			f2fs_msg(sb, KERN_INFO,
 				"nouser_xattr options not supported");
@@ -254,10 +265,16 @@ static int parse_options(struct super_block *sb, char *options)
 			break;
 #endif
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
+		case Opt_acl:
+			set_opt(sbi, POSIX_ACL);
+			break;
 		case Opt_noacl:
 			clear_opt(sbi, POSIX_ACL);
 			break;
 #else
+		case Opt_acl:
+			f2fs_msg(sb, KERN_INFO, "acl options not supported");
+			break;
 		case Opt_noacl:
 			f2fs_msg(sb, KERN_INFO, "noacl options not supported");
 			break;
-- 
cgit v1.3


From 5887d291d792773368f6eaf1759aad109bcd78eb Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Tue, 8 Oct 2013 09:30:33 +0900
Subject: f2fs: avoid unnecessary checkpoints

During the f2fs_put_super procedure, we don't need to conduct checkpoint all
the time, since we don't need to do that if superblock is clean.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index fde8e6aca9be..539c009d4138 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -372,7 +372,9 @@ static void f2fs_put_super(struct super_block *sb)
 	f2fs_destroy_stats(sbi);
 	stop_gc_thread(sbi);
 
-	write_checkpoint(sbi, true);
+	/* We don't need to do checkpoint when it's clean */
+	if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES))
+		write_checkpoint(sbi, true);
 
 	iput(sbi->node_inode);
 	iput(sbi->meta_inode);
-- 
cgit v1.3


From ccaaca25919a2cdeccd8fdce5f546e3ed7f6a9e9 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Tue, 8 Oct 2013 10:19:28 +0900
Subject: f2fs: fix writing incorrect orphan blocks

Previously, there was a erroneous scenario like below.
thread 1:                       thread 2:
 f2fs_unlink
  - acquire_orphan_inode
    : sbi->n_orphans++           write_checkpoint
                                 - block_operations
                                  : f2fs_lock_all
                                 - do_checkpoint
                                  : write orphan blocks with sbi->n_orphans
                                 - unblock_operations
  - f2fs_lock_op
  - release_orphan_inode
  - f2fs_unlock_op

During the checkpoint by thread 2, f2fs stores a wrong orphan block according
to the wrong sbi->n_orphans.
To avoid this, simply we should make cover acquire_orphan_inode too with
f2fs_lock_op.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 29f73fdf958e..575adac17f8b 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -228,14 +228,14 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!de)
 		goto fail;
 
+	f2fs_lock_op(sbi);
 	err = acquire_orphan_inode(sbi);
 	if (err) {
+		f2fs_unlock_op(sbi);
 		kunmap(page);
 		f2fs_put_page(page, 0);
 		goto fail;
 	}
-
-	f2fs_lock_op(sbi);
 	f2fs_delete_entry(de, page, inode);
 	f2fs_unlock_op(sbi);
 
-- 
cgit v1.3


From 2900a579abcea13e4e77c40f89eb16eeecb5508e Mon Sep 17 00:00:00 2001
From: Mark Tinguely <tinguely@sgi.com>
Date: Wed, 2 Oct 2013 16:00:06 -0500
Subject: xfs: add the inode directory type support to XFS_IOC_FSGEOM

Add the inode type directory type support to XFS_IOC_FSGEOM
so that xfs_repair/xfs_info knows if the superblock v4 filesystem
enabled the feature.

Signed-off-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fs.h    | 4 ++--
 fs/xfs/xfs_fsops.c | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 18272c766a50..c5fc116dfaa3 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,11 +233,11 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
 #define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
 #define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800  /* 32-bit project IDs	*/
+#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800	/* 32-bit project IDs	*/
 #define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
 #define XFS_FSOP_GEOM_FLAGS_V5SB	0x8000	/* version 5 superblock */
-
+#define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
 
 /*
  * Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index e64ee5288b86..fdae4ec5f21b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
 			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
 			(xfs_sb_version_hascrc(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_V5SB : 0);
+				XFS_FSOP_GEOM_FLAGS_V5SB : 0) |
+			(xfs_sb_version_hasftype(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_FTYPE : 0);
 		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
 				mp->m_sb.sb_logsectsize : BBSIZE;
 		geo->rtsectsize = mp->m_sb.sb_blocksize;
-- 
cgit v1.3


From 36b21dde6e899d2f25077b7c239d2d6bcff17d73 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 20 Sep 2013 11:06:09 -0400
Subject: xfs: push down inactive transaction mgmt for remote symlinks

Push down the transaction management for remote symlinks from
xfs_inactive() down to xfs_inactive_symlink_rmt(). The latter is
cleaned up to avoid transaction management intended for the
calling context (i.e., trans duplication, reservation, item
attachment).

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c   | 15 ++++-----
 fs/xfs/xfs_symlink.c | 86 ++++++++++++++++++++++++----------------------------
 fs/xfs/xfs_symlink.h |  2 +-
 3 files changed, 49 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5ac716b2df22..c26990b50941 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1724,9 +1724,14 @@ xfs_inactive(
 	if (error)
 		return VN_INACTIVE_CACHE;
 
+	if (S_ISLNK(ip->i_d.di_mode)) {
+		error = xfs_inactive_symlink(ip);
+		if (error)
+			goto out;
+	}
+
 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-	resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ?
-		&M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
+	resp = truncate ? &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
 
 	error = xfs_trans_reserve(tp, resp, 0, 0);
 	if (error) {
@@ -1738,11 +1743,7 @@ xfs_inactive(
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
 
-	if (S_ISLNK(ip->i_d.di_mode)) {
-		error = xfs_inactive_symlink(ip, &tp);
-		if (error)
-			goto out_cancel;
-	} else if (truncate) {
+	if (truncate) {
 		ip->i_d.di_size = 0;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index f622a97a7e33..ded282b10bc2 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -424,8 +424,7 @@ xfs_symlink(
  */
 STATIC int
 xfs_inactive_symlink_rmt(
-	xfs_inode_t	*ip,
-	xfs_trans_t	**tpp)
+	struct xfs_inode *ip)
 {
 	xfs_buf_t	*bp;
 	int		committed;
@@ -437,11 +436,9 @@ xfs_inactive_symlink_rmt(
 	xfs_mount_t	*mp;
 	xfs_bmbt_irec_t	mval[XFS_SYMLINK_MAPS];
 	int		nmaps;
-	xfs_trans_t	*ntp;
 	int		size;
 	xfs_trans_t	*tp;
 
-	tp = *tpp;
 	mp = ip->i_mount;
 	ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
 	/*
@@ -453,6 +450,16 @@ xfs_inactive_symlink_rmt(
 	 */
 	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
 
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
 	/*
 	 * Lock the inode, fix the size, and join it to the transaction.
 	 * Hold it so in the normal path, we still have it locked for
@@ -471,7 +478,7 @@ xfs_inactive_symlink_rmt(
 	error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
 				mval, &nmaps, 0);
 	if (error)
-		goto error0;
+		goto error_trans_cancel;
 	/*
 	 * Invalidate the block(s). No validation is done.
 	 */
@@ -481,22 +488,24 @@ xfs_inactive_symlink_rmt(
 			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
 		if (!bp) {
 			error = ENOMEM;
-			goto error1;
+			goto error_bmap_cancel;
 		}
 		xfs_trans_binval(tp, bp);
 	}
 	/*
 	 * Unmap the dead block(s) to the free_list.
 	 */
-	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, &done)))
-		goto error1;
+	error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+			    &first_block, &free_list, &done);
+	if (error)
+		goto error_bmap_cancel;
 	ASSERT(done);
 	/*
 	 * Commit the first transaction.  This logs the EFI and the inode.
 	 */
-	if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
-		goto error1;
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto error_bmap_cancel;
 	/*
 	 * The transaction must have been committed, since there were
 	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
@@ -510,27 +519,14 @@ xfs_inactive_symlink_rmt(
 	 */
 	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	/*
-	 * Get a new, empty transaction to return to our caller.
-	 */
-	ntp = xfs_trans_dup(tp);
 	/*
 	 * Commit the transaction containing extent freeing and EFDs.
-	 * If we get an error on the commit here or on the reserve below,
-	 * we need to unlock the inode since the new transaction doesn't
-	 * have the inode attached.
 	 */
-	error = xfs_trans_commit(tp, 0);
-	tp = ntp;
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		goto error0;
+		goto error_unlock;
 	}
-	/*
-	 * transaction commit worked ok so we can drop the extra ticket
-	 * reference that we gained in xfs_trans_dup()
-	 */
-	xfs_log_ticket_put(tp->t_ticket);
 
 	/*
 	 * Remove the memory for extent descriptions (just bookkeeping).
@@ -538,23 +534,16 @@ xfs_inactive_symlink_rmt(
 	if (ip->i_df.if_bytes)
 		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
 	ASSERT(ip->i_df.if_bytes == 0);
-	/*
-	 * Put an itruncate log reservation in the new transaction
-	 * for our caller.
-	 */
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
-	if (error) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		goto error0;
-	}
 
-	xfs_trans_ijoin(tp, ip, 0);
-	*tpp = tp;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return 0;
 
- error1:
+error_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
- error0:
+error_trans_cancel:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
 
@@ -563,41 +552,46 @@ xfs_inactive_symlink_rmt(
  */
 int
 xfs_inactive_symlink(
-	struct xfs_inode	*ip,
-	struct xfs_trans	**tp)
+	struct xfs_inode	*ip)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	int			pathlen;
 
 	trace_xfs_inactive_symlink(ip);
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
 	/*
 	 * Zero length symlinks _can_ exist.
 	 */
 	pathlen = (int)ip->i_d.di_size;
-	if (!pathlen)
+	if (!pathlen) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		return 0;
+	}
 
 	if (pathlen < 0 || pathlen > MAXPATHLEN) {
 		xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
 			 __func__, (unsigned long long)ip->i_ino, pathlen);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		ASSERT(0);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
 	if (ip->i_df.if_flags & XFS_IFINLINE) {
-		if (ip->i_df.if_bytes > 0)
+		if (ip->i_df.if_bytes > 0) 
 			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
 					  XFS_DATA_FORK);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		ASSERT(ip->i_df.if_bytes == 0);
 		return 0;
 	}
 
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
 	/* remove the remote symlink */
-	return xfs_inactive_symlink_rmt(ip, tp);
+	return xfs_inactive_symlink_rmt(ip);
 }
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index 99338ba666ac..e75245d09116 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -22,6 +22,6 @@
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
+int xfs_inactive_symlink(struct xfs_inode *ip);
 
 #endif /* __XFS_SYMLINK_H */
-- 
cgit v1.3


From f7be2d7f594cbc7a00902b5427332a1ad519a528 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 20 Sep 2013 11:06:10 -0400
Subject: xfs: push down inactive transaction mgmt for truncate

Create the new xfs_inactive_truncate() function to handle the
truncate portion of xfs_inactive(). Push the locking and
transaction management into the new function.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c | 117 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 68 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c26990b50941..7b86f643046b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1662,6 +1662,58 @@ xfs_release(
 	return 0;
 }
 
+/*
+ * xfs_inactive_truncate
+ *
+ * Called to perform a truncate when an inode becomes unlinked.
+ */
+STATIC int
+xfs_inactive_truncate(
+	struct xfs_inode *ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/*
+	 * Log the inode size first to prevent stale data exposure in the event
+	 * of a system crash before the truncate completes. See the related
+	 * comment in xfs_setattr_size() for details.
+	 */
+	ip->i_d.di_size = 0;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+	if (error)
+		goto error_trans_cancel;
+
+	ASSERT(ip->i_d.di_nextents == 0);
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		goto error_unlock;
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+
+error_trans_cancel:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
 /*
  * xfs_inactive
  *
@@ -1679,7 +1731,6 @@ xfs_inactive(
 	int			committed;
 	struct xfs_trans	*tp;
 	struct xfs_mount	*mp;
-	struct xfs_trans_res	*resp;
 	int			error;
 	int			truncate = 0;
 
@@ -1724,35 +1775,12 @@ xfs_inactive(
 	if (error)
 		return VN_INACTIVE_CACHE;
 
-	if (S_ISLNK(ip->i_d.di_mode)) {
+	if (S_ISLNK(ip->i_d.di_mode))
 		error = xfs_inactive_symlink(ip);
-		if (error)
-			goto out;
-	}
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-	resp = truncate ? &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
-
-	error = xfs_trans_reserve(tp, resp, 0, 0);
-	if (error) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		xfs_trans_cancel(tp, 0);
-		return VN_INACTIVE_CACHE;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, 0);
-
-	if (truncate) {
-		ip->i_d.di_size = 0;
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
-		if (error)
-			goto out_cancel;
-
-		ASSERT(ip->i_d.di_nextents == 0);
-	}
+	else if (truncate)
+		error = xfs_inactive_truncate(ip);
+	if (error)
+		goto out;
 
 	/*
 	 * If there are attributes associated with the file then blow them away
@@ -1763,25 +1791,9 @@ xfs_inactive(
 	if (ip->i_d.di_anextents > 0) {
 		ASSERT(ip->i_d.di_forkoff != 0);
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		if (error)
-			goto out_unlock;
-
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
 		error = xfs_attr_inactive(ip);
 		if (error)
 			goto out;
-
-		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			goto out;
-		}
-
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, 0);
 	}
 
 	if (ip->i_afp)
@@ -1789,6 +1801,17 @@ xfs_inactive(
 
 	ASSERT(ip->i_d.di_anextents == 0);
 
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+		goto out;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
 	/*
 	 * Free the inode.
 	 */
@@ -1831,13 +1854,9 @@ xfs_inactive(
 	 * Release the dquots held by inode, if any.
 	 */
 	xfs_qm_dqdetach(ip);
-out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
 	return VN_INACTIVE_CACHE;
-out_cancel:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-	goto out_unlock;
 }
 
 /*
-- 
cgit v1.3


From 88877d2b9727a14431bfe48216ff86331ab47ea5 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 20 Sep 2013 11:06:11 -0400
Subject: xfs: push down inactive transaction mgmt for ifree

Push the inode free work performed during xfs_inactive() down into
a new xfs_inactive_ifree() helper. This clears xfs_inactive() from
all inode locking and transaction management more directly
associated with freeing the inode xattrs, extents and the inode
itself.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c | 121 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 71 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7b86f643046b..223d1a163b2d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1714,6 +1714,74 @@ error_unlock:
 	return error;
 }
 
+/*
+ * xfs_inactive_ifree()
+ *
+ * Perform the inode free when an inode is unlinked.
+ */
+STATIC int
+xfs_inactive_ifree(
+	struct xfs_inode *ip)
+{
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			committed;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	xfs_bmap_init(&free_list, &first_block);
+	error = xfs_ifree(tp, ip, &free_list);
+	if (error) {
+		/*
+		 * If we fail to free the inode, shut down.  The cancel
+		 * might do that, we need to make sure.  Otherwise the
+		 * inode might be lost for a long time or forever.
+		 */
+		if (!XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_notice(mp, "%s: xfs_ifree returned error %d",
+				__func__, error);
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		}
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return error;
+	}
+
+	/*
+	 * Credit the quota account(s). The inode is gone.
+	 */
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+	/*
+	 * Just ignore errors at this point.  There is nothing we can
+	 * do except to try to keep going. Make sure it's not a silent
+	 * error.
+	 */
+	error = xfs_bmap_finish(&tp,  &free_list, &committed);
+	if (error)
+		xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+			__func__, error);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	if (error)
+		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+			__func__, error);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
 /*
  * xfs_inactive
  *
@@ -1726,10 +1794,6 @@ int
 xfs_inactive(
 	xfs_inode_t	*ip)
 {
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	int			committed;
-	struct xfs_trans	*tp;
 	struct xfs_mount	*mp;
 	int			error;
 	int			truncate = 0;
@@ -1801,60 +1865,17 @@ xfs_inactive(
 
 	ASSERT(ip->i_d.di_anextents == 0);
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
-	if (error) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
-		goto out;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, 0);
-
 	/*
 	 * Free the inode.
 	 */
-	xfs_bmap_init(&free_list, &first_block);
-	error = xfs_ifree(tp, ip, &free_list);
-	if (error) {
-		/*
-		 * If we fail to free the inode, shut down.  The cancel
-		 * might do that, we need to make sure.  Otherwise the
-		 * inode might be lost for a long time or forever.
-		 */
-		if (!XFS_FORCED_SHUTDOWN(mp)) {
-			xfs_notice(mp, "%s: xfs_ifree returned error %d",
-				__func__, error);
-			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-		}
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	} else {
-		/*
-		 * Credit the quota account(s). The inode is gone.
-		 */
-		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
-
-		/*
-		 * Just ignore errors at this point.  There is nothing we can
-		 * do except to try to keep going. Make sure it's not a silent
-		 * error.
-		 */
-		error = xfs_bmap_finish(&tp,  &free_list, &committed);
-		if (error)
-			xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
-				__func__, error);
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		if (error)
-			xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
-				__func__, error);
-	}
+	error = xfs_inactive_ifree(ip);
+	if (error)
+		goto out;
 
 	/*
 	 * Release the dquots held by inode, if any.
 	 */
 	xfs_qm_dqdetach(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
 	return VN_INACTIVE_CACHE;
 }
-- 
cgit v1.3


From 74564fb48cbfcb5b433c1baec1f3158ea638b203 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 20 Sep 2013 11:06:12 -0400
Subject: xfs: clean up xfs_inactive() error handling, kill
 VN_INACTIVE_[NO]CACHE

The xfs_inactive() return value is meaningless. Turn xfs_inactive()
into a void function and clean up the error handling appropriately.
Kill the VN_INACTIVE_[NO]CACHE directives as they are not relevant
to Linux.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c | 28 +++++++++++-----------------
 fs/xfs/xfs_inode.h |  2 +-
 fs/xfs/xfs_vnode.h |  8 --------
 3 files changed, 12 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 223d1a163b2d..5fce7261a64b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1790,7 +1790,7 @@ xfs_inactive_ifree(
  * now be truncated.  Also, we clear all of the read-ahead state
  * kept for the inode here since the file is now closed.
  */
-int
+void
 xfs_inactive(
 	xfs_inode_t	*ip)
 {
@@ -1805,16 +1805,14 @@ xfs_inactive(
 	if (ip->i_d.di_mode == 0) {
 		ASSERT(ip->i_df.if_real_bytes == 0);
 		ASSERT(ip->i_df.if_broot_bytes == 0);
-		return VN_INACTIVE_CACHE;
+		return;
 	}
 
 	mp = ip->i_mount;
 
-	error = 0;
-
 	/* If this is a read-only mount, don't do this (would generate I/O) */
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		goto out;
+		return;
 
 	if (ip->i_d.di_nlink != 0) {
 		/*
@@ -1822,12 +1820,10 @@ xfs_inactive(
 		 * cache. Post-eof blocks must be freed, lest we end up with
 		 * broken free space accounting.
 		 */
-		if (xfs_can_free_eofblocks(ip, true)) {
-			error = xfs_free_eofblocks(mp, ip, false);
-			if (error)
-				return VN_INACTIVE_CACHE;
-		}
-		goto out;
+		if (xfs_can_free_eofblocks(ip, true))
+			xfs_free_eofblocks(mp, ip, false);
+
+		return;
 	}
 
 	if (S_ISREG(ip->i_d.di_mode) &&
@@ -1837,14 +1833,14 @@ xfs_inactive(
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
-		return VN_INACTIVE_CACHE;
+		return;
 
 	if (S_ISLNK(ip->i_d.di_mode))
 		error = xfs_inactive_symlink(ip);
 	else if (truncate)
 		error = xfs_inactive_truncate(ip);
 	if (error)
-		goto out;
+		return;
 
 	/*
 	 * If there are attributes associated with the file then blow them away
@@ -1857,7 +1853,7 @@ xfs_inactive(
 
 		error = xfs_attr_inactive(ip);
 		if (error)
-			goto out;
+			return;
 	}
 
 	if (ip->i_afp)
@@ -1870,14 +1866,12 @@ xfs_inactive(
 	 */
 	error = xfs_inactive_ifree(ip);
 	if (error)
-		goto out;
+		return;
 
 	/*
 	 * Release the dquots held by inode, if any.
 	 */
 	xfs_qm_dqdetach(ip);
-out:
-	return VN_INACTIVE_CACHE;
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4a91358c1470..cce62ce1a73a 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -316,7 +316,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 
 
 int		xfs_release(struct xfs_inode *ip);
-int		xfs_inactive(struct xfs_inode *ip);
+void		xfs_inactive(struct xfs_inode *ip);
 int		xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 			   struct xfs_inode **ipp, struct xfs_name *ci_name);
 int		xfs_create(struct xfs_inode *dp, struct xfs_name *name,
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index db14d0c08682..3e8e797c6d11 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -24,14 +24,6 @@ struct file;
 struct xfs_inode;
 struct attrlist_cursor_kern;
 
-/*
- * Return values for xfs_inactive.  A return value of
- * VN_INACTIVE_NOCACHE implies that the file system behavior
- * has disassociated its state and bhv_desc_t from the vnode.
- */
-#define	VN_INACTIVE_CACHE	0
-#define	VN_INACTIVE_NOCACHE	1
-
 /*
  * Flags for read/write calls - same values as IRIX
  */
-- 
cgit v1.3


From e29cf08b05dc0b8151d65704d96d525a9e179a6b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 7 Oct 2013 11:29:22 +0100
Subject: sched/numa: Report a NUMA task group ID

It is desirable to model from userspace how the scheduler groups tasks
over time. This patch adds an ID to the numa_group and reports it via
/proc/PID/status.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-45-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/array.c       | 2 ++
 include/linux/sched.h | 5 +++++
 kernel/sched/fair.c   | 7 +++++++
 3 files changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index cbd0f1b324b9..1bd2077187fd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
+		"Ngid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
 		"TracerPid:\t%d\n"
@@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
 		task_tgid_nr_ns(p, ns),
+		task_numa_group_id(p),
 		pid_nr_ns(pid, ns),
 		ppid, tpid,
 		from_kuid_munged(user_ns, cred->uid),
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f587ded5c148..b0b343b1ba64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1452,12 +1452,17 @@ struct task_struct {
 
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, bool migrated);
+extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   bool migrated)
 {
 }
+static inline pid_t task_numa_group_id(struct task_struct *p)
+{
+	return 0;
+}
 static inline void set_numabalancing_state(bool enabled)
 {
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 85565053a6ed..5bd309c035c7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -893,12 +893,18 @@ struct numa_group {
 
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
+	pid_t gid;
 	struct list_head task_list;
 
 	struct rcu_head rcu;
 	atomic_long_t faults[0];
 };
 
+pid_t task_numa_group_id(struct task_struct *p)
+{
+	return p->numa_group ? p->numa_group->gid : 0;
+}
+
 static inline int task_faults_idx(int nid, int priv)
 {
 	return 2 * nid + priv;
@@ -1265,6 +1271,7 @@ static void task_numa_group(struct task_struct *p, int cpupid)
 		atomic_set(&grp->refcount, 1);
 		spin_lock_init(&grp->lock);
 		INIT_LIST_HEAD(&grp->task_list);
+		grp->gid = p->pid;
 
 		for (i = 0; i < 2*nr_node_ids; i++)
 			atomic_long_set(&grp->faults[i], p->numa_faults[i]);
-- 
cgit v1.3


From 82727018b0d33d188e9916bcf76f18387484cb04 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 7 Oct 2013 11:29:28 +0100
Subject: sched/numa: Call task_numa_free() from do_execve()

It is possible for a task in a numa group to call exec, and
have the new (unrelated) executable inherit the numa group
association from its former self.

This has the potential to break numa grouping, and is trivial
to fix.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-51-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/exec.c             | 1 +
 include/linux/sched.h | 4 ++++
 kernel/sched/fair.c   | 9 ++++++++-
 kernel/sched/sched.h  | 5 -----
 4 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 8875dd10ae7a..2ea437e5acf4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	acct_update_integrals(current);
+	task_numa_free(current);
 	free_bprm(bprm);
 	if (displaced)
 		put_files_struct(displaced);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 724482200b83..f6385107c352 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1458,6 +1458,7 @@ struct task_struct {
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
+extern void task_numa_free(struct task_struct *p);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
@@ -1470,6 +1471,9 @@ static inline pid_t task_numa_group_id(struct task_struct *p)
 static inline void set_numabalancing_state(bool enabled)
 {
 }
+static inline void task_numa_free(struct task_struct *p)
+{
+}
 #endif
 
 static inline struct pid *task_pid(struct task_struct *task)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4c40e13310e9..c4df2de6ca4a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1418,6 +1418,7 @@ void task_numa_free(struct task_struct *p)
 {
 	struct numa_group *grp = p->numa_group;
 	int i;
+	void *numa_faults = p->numa_faults;
 
 	if (grp) {
 		for (i = 0; i < 2*nr_node_ids; i++)
@@ -1433,7 +1434,9 @@ void task_numa_free(struct task_struct *p)
 		put_numa_group(grp);
 	}
 
-	kfree(p->numa_faults);
+	p->numa_faults = NULL;
+	p->numa_faults_buffer = NULL;
+	kfree(numa_faults);
 }
 
 /*
@@ -1452,6 +1455,10 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 	if (!p->mm)
 		return;
 
+	/* Do not worry about placement if exiting */
+	if (p->state == TASK_DEAD)
+		return;
+
 	/* Allocate buffer to track faults on a per-node basis */
 	if (unlikely(!p->numa_faults)) {
 		int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8037b10a256f..eeb1923812a1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -559,11 +559,6 @@ static inline u64 rq_clock_task(struct rq *rq)
 #ifdef CONFIG_NUMA_BALANCING
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
-extern void task_numa_free(struct task_struct *p);
-#else /* CONFIG_NUMA_BALANCING */
-static inline void task_numa_free(struct task_struct *p)
-{
-}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP
-- 
cgit v1.3


From 32df8dca503f82c816f8be85a8d0a394a8b88c2c Mon Sep 17 00:00:00 2001
From: Rob Herring <rob.herring@calxeda.com>
Date: Sat, 7 Sep 2013 14:11:58 -0500
Subject: of: remove HAVE_ARCH_DEVTREE_FIXUPS

HAVE_ARCH_DEVTREE_FIXUPS appears to always be needed except for sparc,
but it is only used for /proc/device-teee and sparc does not enable
/proc/device-tree. So this option is redundant. Remove the option and
always enable it. This has the side effect of fixing /proc/device-tree
on arches such as arm64 which failed to define this option.

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Acked-by: Vineet Gupta <vgupta@synopsys.com>
Acked-by: Grant Likely <grant.likely@linaro.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Chris Zankel <chris@zankel.net>
Cc: Max Filippov <jcmvbkbc@gmail.com>
---
 arch/arc/include/asm/prom.h        | 2 --
 arch/arm/include/asm/prom.h        | 2 --
 arch/metag/include/asm/prom.h      | 1 -
 arch/microblaze/include/asm/prom.h | 2 --
 arch/openrisc/include/asm/prom.h   | 2 --
 arch/powerpc/include/asm/prom.h    | 2 --
 arch/x86/include/asm/prom.h        | 2 --
 arch/xtensa/include/asm/prom.h     | 2 --
 fs/proc/proc_devtree.c             | 3 ---
 9 files changed, 18 deletions(-)

(limited to 'fs')

diff --git a/arch/arc/include/asm/prom.h b/arch/arc/include/asm/prom.h
index 692d0d0789a7..15555499e430 100644
--- a/arch/arc/include/asm/prom.h
+++ b/arch/arc/include/asm/prom.h
@@ -9,6 +9,4 @@
 #ifndef _ASM_ARC_PROM_H_
 #define _ASM_ARC_PROM_H_
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
-
 #endif
diff --git a/arch/arm/include/asm/prom.h b/arch/arm/include/asm/prom.h
index 4a2985e21969..b681575ad3de 100644
--- a/arch/arm/include/asm/prom.h
+++ b/arch/arm/include/asm/prom.h
@@ -11,8 +11,6 @@
 #ifndef __ASMARM_PROM_H
 #define __ASMARM_PROM_H
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
-
 #ifdef CONFIG_OF
 
 extern const struct machine_desc *setup_machine_fdt(unsigned int dt_phys);
diff --git a/arch/metag/include/asm/prom.h b/arch/metag/include/asm/prom.h
index e19022c5a0cb..d4be144a8613 100644
--- a/arch/metag/include/asm/prom.h
+++ b/arch/metag/include/asm/prom.h
@@ -14,7 +14,6 @@
 #ifndef __ASM_METAG_PROM_H
 #define __ASM_METAG_PROM_H
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
 
 extern void copy_fdt(void);
 
diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 0ebd924902df..2f03ac815851 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -16,8 +16,6 @@
 
 #include <linux/of.h>
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
-
 /* Other Prototypes */
 enum early_consoles {
 	UARTLITE = 1,
diff --git a/arch/openrisc/include/asm/prom.h b/arch/openrisc/include/asm/prom.h
index 93c9980e1b6b..bec477ba9f7d 100644
--- a/arch/openrisc/include/asm/prom.h
+++ b/arch/openrisc/include/asm/prom.h
@@ -17,6 +17,4 @@
 #ifndef _ASM_OPENRISC_PROM_H
 #define _ASM_OPENRISC_PROM_H
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
-
 #endif /* _ASM_OPENRISC_PROM_H */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 6707c16d8fc5..43fe0023d722 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -20,8 +20,6 @@
 #include <asm/irq.h>
 #include <linux/atomic.h>
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
-
 /*
  * OF address retreival & translation
  */
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 8ef2ec70858f..fbeb06ed0eaa 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -39,7 +39,5 @@ static inline void x86_dtb_init(void) { }
 
 extern char cmd_line[COMMAND_LINE_SIZE];
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
-
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/xtensa/include/asm/prom.h b/arch/xtensa/include/asm/prom.h
index f3d7cd2c0de7..2a87a583b41b 100644
--- a/arch/xtensa/include/asm/prom.h
+++ b/arch/xtensa/include/asm/prom.h
@@ -1,6 +1,4 @@
 #ifndef _XTENSA_ASM_PROM_H
 #define _XTENSA_ASM_PROM_H
 
-#define HAVE_ARCH_DEVTREE_FIXUPS
-
 #endif /* _XTENSA_ASM_PROM_H */
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 106a83570630..70779b2fc209 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -14,16 +14,13 @@
 #include <linux/of.h>
 #include <linux/export.h>
 #include <linux/slab.h>
-#include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 static inline void set_node_proc_entry(struct device_node *np,
 				       struct proc_dir_entry *de)
 {
-#ifdef HAVE_ARCH_DEVTREE_FIXUPS
 	np->pde = de;
-#endif
 }
 
 static struct proc_dir_entry *proc_device_tree;
-- 
cgit v1.3


From e34ecee2ae791df674dfb466ce40692ca6218e43 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 10 Oct 2013 19:31:47 -0700
Subject: aio: Fix a trinity splat

aio kiocb refcounting was broken - it was relying on keeping track of
the number of available ring buffer entries, which it needs to do
anyways; then at shutdown time it'd wait for completions to be delivered
until the # of available ring buffer entries equalled what it was
initialized to.

Problem with  that is that the ring buffer is mapped writable into
userspace, so userspace could futz with the head and tail pointers to
cause the kernel to see extra completions, and cause free_ioctx() to
return while there were still outstanding kiocbs. Which would be bad.

Fix is just to directly refcount the kiocbs - which is more
straightforward, and with the new percpu refcounting code doesn't cost
us any cacheline bouncing which was the whole point of the original
scheme.

Also clean up ioctx_alloc()'s error path and fix a bug where it wasn't
subtracting from aio_nr if ioctx_add_table() failed.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 fs/aio.c | 129 ++++++++++++++++++++++++---------------------------------------
 1 file changed, 48 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 067e3d340c35..ee77dc13d5b2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -80,6 +80,8 @@ struct kioctx {
 	struct percpu_ref	users;
 	atomic_t		dead;
 
+	struct percpu_ref	reqs;
+
 	unsigned long		user_id;
 
 	struct __percpu kioctx_cpu *cpu;
@@ -107,7 +109,6 @@ struct kioctx {
 	struct page		**ring_pages;
 	long			nr_pages;
 
-	struct rcu_head		rcu_head;
 	struct work_struct	free_work;
 
 	struct {
@@ -412,26 +413,34 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
 	return cancel(kiocb);
 }
 
-static void free_ioctx_rcu(struct rcu_head *head)
+static void free_ioctx(struct work_struct *work)
 {
-	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+
+	pr_debug("freeing %p\n", ctx);
 
+	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
+static void free_ioctx_reqs(struct percpu_ref *ref)
+{
+	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+
+	INIT_WORK(&ctx->free_work, free_ioctx);
+	schedule_work(&ctx->free_work);
+}
+
 /*
  * When this function runs, the kioctx has been removed from the "hash table"
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx(struct work_struct *work)
+static void free_ioctx_users(struct percpu_ref *ref)
 {
-	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
-	struct aio_ring *ring;
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
 	struct kiocb *req;
-	unsigned cpu, avail;
-	DEFINE_WAIT(wait);
 
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -445,54 +454,8 @@ static void free_ioctx(struct work_struct *work)
 
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	for_each_possible_cpu(cpu) {
-		struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
-
-		atomic_add(kcpu->reqs_available, &ctx->reqs_available);
-		kcpu->reqs_available = 0;
-	}
-
-	while (1) {
-		prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
-		ring = kmap_atomic(ctx->ring_pages[0]);
-		avail = (ring->head <= ring->tail)
-			 ? ring->tail - ring->head
-			 : ctx->nr_events - ring->head + ring->tail;
-
-		atomic_add(avail, &ctx->reqs_available);
-		ring->head = ring->tail;
-		kunmap_atomic(ring);
-
-		if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
-			break;
-
-		schedule();
-	}
-	finish_wait(&ctx->wait, &wait);
-
-	WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
-
-	aio_free_ring(ctx);
-
-	pr_debug("freeing %p\n", ctx);
-
-	/*
-	 * Here the call_rcu() is between the wait_event() for reqs_active to
-	 * hit 0, and freeing the ioctx.
-	 *
-	 * aio_complete() decrements reqs_active, but it has to touch the ioctx
-	 * after to issue a wakeup so we use rcu.
-	 */
-	call_rcu(&ctx->rcu_head, free_ioctx_rcu);
-}
-
-static void free_ioctx_ref(struct percpu_ref *ref)
-{
-	struct kioctx *ctx = container_of(ref, struct kioctx, users);
-
-	INIT_WORK(&ctx->free_work, free_ioctx);
-	schedule_work(&ctx->free_work);
+	percpu_ref_kill(&ctx->reqs);
+	percpu_ref_put(&ctx->reqs);
 }
 
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
@@ -551,6 +514,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	}
 }
 
+static void aio_nr_sub(unsigned nr)
+{
+	spin_lock(&aio_nr_lock);
+	if (WARN_ON(aio_nr - nr > aio_nr))
+		aio_nr = 0;
+	else
+		aio_nr -= nr;
+	spin_unlock(&aio_nr_lock);
+}
+
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
  */
@@ -588,8 +561,11 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_ref))
-		goto out_freectx;
+	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+		goto err;
+
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+		goto err;
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -600,10 +576,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
 	if (!ctx->cpu)
-		goto out_freeref;
+		goto err;
 
 	if (aio_setup_ring(ctx) < 0)
-		goto out_freepcpu;
+		goto err;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
 	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
@@ -615,7 +591,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
-		goto out_cleanup;
+		err = -EAGAIN;
+		goto err;
 	}
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
@@ -624,23 +601,19 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	err = ioctx_add_table(ctx, mm);
 	if (err)
-		goto out_cleanup_put;
+		goto err_cleanup;
 
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
-out_cleanup_put:
-	percpu_ref_put(&ctx->users);
-out_cleanup:
-	err = -EAGAIN;
+err_cleanup:
+	aio_nr_sub(ctx->max_reqs);
+err:
 	aio_free_ring(ctx);
-out_freepcpu:
 	free_percpu(ctx->cpu);
-out_freeref:
+	free_percpu(ctx->reqs.pcpu_count);
 	free_percpu(ctx->users.pcpu_count);
-out_freectx:
-	put_aio_ring_file(ctx);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
@@ -675,10 +648,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 		 * -EAGAIN with no ioctxs actually in use (as far as userspace
 		 *  could tell).
 		 */
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
-		aio_nr -= ctx->max_reqs;
-		spin_unlock(&aio_nr_lock);
+		aio_nr_sub(ctx->max_reqs);
 
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
@@ -810,6 +780,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 	if (unlikely(!req))
 		goto out_put;
 
+	percpu_ref_get(&ctx->reqs);
+
 	req->ki_ctx = ctx;
 	return req;
 out_put:
@@ -879,12 +851,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 		return;
 	}
 
-	/*
-	 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
-	 * need to issue a wakeup after incrementing reqs_available.
-	 */
-	rcu_read_lock();
-
 	if (iocb->ki_list.next) {
 		unsigned long flags;
 
@@ -959,7 +925,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
 
-	rcu_read_unlock();
+	percpu_ref_put(&ctx->reqs);
 }
 EXPORT_SYMBOL(aio_complete);
 
@@ -1370,6 +1336,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	return 0;
 out_put_req:
 	put_reqs_available(ctx, 1);
+	percpu_ref_put(&ctx->reqs);
 	kiocb_free(req);
 	return ret;
 }
-- 
cgit v1.3


From a26a8746ce5eee6c3dd11f98f5dadb56c5f91ed7 Mon Sep 17 00:00:00 2001
From: Michael Witten <mfwitten@gmail.com>
Date: Wed, 14 Aug 2013 09:59:45 +0000
Subject: Docs: Kconfig: `devlopers' -> `developers'

Signed-off-by: Michael Witten <mfwitten@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/btrfs/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 2b3b83296977..e374bf96a442 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -59,7 +59,8 @@ config BTRFS_FS_RUN_SANITY_TESTS
 	help
 	  This will run some basic sanity tests on the free space cache
 	  code to make sure it is acting as it should.  These are mostly
-	  regression tests and are only really interesting to btrfs devlopers.
+	  regression tests and are only really interesting to btrfs
+	  developers.
 
 	  If unsure, say N.
 
-- 
cgit v1.3


From 785a162d147a547bc7a577c1c28f6fb9dbeb4f16 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 14 Oct 2013 09:27:11 -0400
Subject: sysfs: make sysfs_file_ops() follow ignore_lockdep flag

375b611e60 ("sysfs: remove sysfs_buffer->ops") introduced
sysfs_file_ops() which determines the associated file operation of a
given sysfs_dirent.  As file ops access should be protected by an
active reference, the new function includes a lockdep assertion on the
sysfs_dirent; unfortunately, I forgot to take attr->ignore_lockdep
flag into account and the lockdep assertion trips spuriously for files
which opt out from active reference lockdep checking.

# cat /sys/devices/pci0000:00/0000:00:01.2/usb1/authorized

 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 540 at /work/os/work/fs/sysfs/file.c:79 sysfs_file_ops+0x4e/0x60()
 Modules linked in:
 CPU: 1 PID: 540 Comm: cat Not tainted 3.11.0-work+ #3
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  0000000000000009 ffff880016205c08 ffffffff81ca0131 0000000000000000
  ffff880016205c40 ffffffff81096d0d ffff8800166cb898 ffff8800166f6f60
  ffffffff8125a220 ffff880011ab1ec0 ffff88000aff0c78 ffff880016205c50
 Call Trace:
  [<ffffffff81ca0131>] dump_stack+0x4e/0x82
  [<ffffffff81096d0d>] warn_slowpath_common+0x7d/0xa0
  [<ffffffff81096dea>] warn_slowpath_null+0x1a/0x20
  [<ffffffff8125994e>] sysfs_file_ops+0x4e/0x60
  [<ffffffff8125a274>] sysfs_open_file+0x54/0x300
  [<ffffffff811df612>] do_dentry_open.isra.17+0x182/0x280
  [<ffffffff811df820>] finish_open+0x30/0x40
  [<ffffffff811f0623>] do_last+0x503/0xd90
  [<ffffffff811f0f6b>] path_openat+0xbb/0x6d0
  [<ffffffff811f23ba>] do_filp_open+0x3a/0x90
  [<ffffffff811e09a9>] do_sys_open+0x129/0x220
  [<ffffffff811e0abe>] SyS_open+0x1e/0x20
  [<ffffffff81caf3c2>] system_call_fastpath+0x16/0x1b
 ---[ end trace aa48096b111dafdb ]---

Rename fs/sysfs/dir.c::ignore_lockdep() to sysfs_ignore_lockdep() and
move it to fs/sysfs/sysfs.h and make sysfs_file_ops() skip lockdep
assertion if sysfs_ignore_lockdep() is true.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c   | 22 ++--------------------
 fs/sysfs/file.c  |  3 ++-
 fs/sysfs/sysfs.h | 16 ++++++++++++++++
 3 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index f6025c81bfd5..eab59de47556 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -144,24 +144,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 		sd->s_parent->s_flags &= ~SYSFS_FLAG_HAS_NS;
 }
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-/* Test for attributes that want to ignore lockdep for read-locking */
-static bool ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return sysfs_type(sd) == SYSFS_KOBJ_ATTR &&
-			sd->s_attr.attr->ignore_lockdep;
-}
-
-#else
-
-static inline bool ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return true;
-}
-
-#endif
-
 /**
  *	sysfs_get_active - get an active reference to sysfs_dirent
  *	@sd: sysfs_dirent to get an active reference to
@@ -180,7 +162,7 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 	if (!atomic_inc_unless_negative(&sd->s_active))
 		return NULL;
 
-	if (likely(!ignore_lockdep(sd)))
+	if (likely(!sysfs_ignore_lockdep(sd)))
 		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
 	return sd;
 }
@@ -199,7 +181,7 @@ void sysfs_put_active(struct sysfs_dirent *sd)
 	if (unlikely(!sd))
 		return;
 
-	if (likely(!ignore_lockdep(sd)))
+	if (likely(!sysfs_ignore_lockdep(sd)))
 		rwsem_release(&sd->dep_map, 1, _RET_IP_);
 	v = atomic_dec_return(&sd->s_active);
 	if (likely(v != SD_DEACTIVATED_BIAS))
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 5f7a955550de..c324ee906df0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -76,7 +76,8 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
 {
 	struct kobject *kobj = sd->s_parent->s_dir.kobj;
 
-	lockdep_assert_held(sd);
+	if (!sysfs_ignore_lockdep(sd))
+		lockdep_assert_held(sd);
 	return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
 }
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 4e01d3b3909c..94ac8fa34dd4 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -103,6 +103,7 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+
 #define sysfs_dirent_init_lockdep(sd)				\
 do {								\
 	struct attribute *attr = sd->s_attr.attr;		\
@@ -112,8 +113,23 @@ do {								\
 								\
 	lockdep_init_map(&sd->dep_map, "s_active", key, 0);	\
 } while (0)
+
+/* Test for attributes that want to ignore lockdep for read-locking */
+static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
+{
+	return sysfs_type(sd) == SYSFS_KOBJ_ATTR &&
+		sd->s_attr.attr->ignore_lockdep;
+}
+
 #else
+
 #define sysfs_dirent_init_lockdep(sd) do {} while (0)
+
+static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
+{
+	return true;
+}
+
 #endif
 
 /*
-- 
cgit v1.3


From d723a92dd465d549bf79dd481c09d59f0be02936 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 10 Oct 2013 18:03:55 +1100
Subject: sysfs/bin: Fix size handling overflow for bin_attribute

While looking at the code, I noticed that bin_attribute read() and write()
ops copy the inode size into an int for futher comparisons.

Some bin_attributes can be fairly large. For example, pci creates some for
BARs set to the BAR size and giant BARs are around the corner, so this is
going to break something somewhere eventually.

Let's use the right type.

[adjust for seqfile conversions, only needed for bin_read() - gkh]

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c324ee906df0..5d818df7250b 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -156,7 +156,7 @@ static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
 	struct sysfs_open_file *of = sysfs_of(file);
 	struct bin_attribute *battr = of->sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	int size = file_inode(file)->i_size;
+	loff_t size = file_inode(file)->i_size;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	loff_t offs = *off;
 	char *buf;
-- 
cgit v1.3


From e66cf161098a634dc96e32d0089c5767cf25668a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 15 Oct 2013 15:18:08 +0100
Subject: GFS2: Use lockref for glocks

Currently glocks have an atomic reference count and also a spinlock
which covers various internal fields, such as the state. This intent of
this patch is to replace the spinlock and the atomic reference count
with a lockref structure. This contains a spinlock which we can continue
to use as before, and a reference counter which is used in conjuction
with the spinlock to replace the previous atomic counter.

As a result of this there are some new rules for reference counting on
glocks. We need to distinguish between reference count changes under
gl_spin (which are now just increment or decrement of the new counter,
provided the count cannot hit zero) and those which are outside of
gl_spin, but which now take gl_spin internally.

The conversion is relatively straight forward. There is probably some
further clean up which can be done, but the priority at this stage is to
make the change in as simple a manner as possible.

A consequence of this change is that the reference count is being
decoupled from the lru list processing. This should allow future
adoption of the lru_list code with glocks in due course.

The reason for using the "dead" state and not just relying on 0 being
the "invalid state" is so that in due course 0 ref counts can be
allowable. The intent is to eventually be able to remove the ref count
changes which are currently hidden away in state_change().

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c         | 83 ++++++++++++++++++++++++-------------------------
 fs/gfs2/glock.h         |  2 --
 fs/gfs2/glops.c         |  4 +--
 fs/gfs2/incore.h        |  5 +--
 include/linux/lockref.h |  6 ++++
 lib/lockref.c           |  1 +
 6 files changed, 52 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c2f41b4d00b9..e66a8009aff1 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -31,6 +31,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/percpu.h>
 #include <linux/list_sort.h>
+#include <linux/lockref.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -129,10 +130,10 @@ void gfs2_glock_free(struct gfs2_glock *gl)
  *
  */
 
-void gfs2_glock_hold(struct gfs2_glock *gl)
+static void gfs2_glock_hold(struct gfs2_glock *gl)
 {
-	GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
-	atomic_inc(&gl->gl_ref);
+	GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
+	lockref_get(&gl->gl_lockref);
 }
 
 /**
@@ -186,20 +187,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 	spin_unlock(&lru_lock);
 }
 
-/**
- * gfs2_glock_put_nolock() - Decrement reference count on glock
- * @gl: The glock to put
- *
- * This function should only be used if the caller has its own reference
- * to the glock, in addition to the one it is dropping.
- */
-
-void gfs2_glock_put_nolock(struct gfs2_glock *gl)
-{
-	if (atomic_dec_and_test(&gl->gl_ref))
-		GLOCK_BUG_ON(gl, 1);
-}
-
 /**
  * gfs2_glock_put() - Decrement reference count on glock
  * @gl: The glock to put
@@ -211,17 +198,22 @@ void gfs2_glock_put(struct gfs2_glock *gl)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct address_space *mapping = gfs2_glock2aspace(gl);
 
-	if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
-		__gfs2_glock_remove_from_lru(gl);
-		spin_unlock(&lru_lock);
-		spin_lock_bucket(gl->gl_hash);
-		hlist_bl_del_rcu(&gl->gl_list);
-		spin_unlock_bucket(gl->gl_hash);
-		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
-		GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
-		trace_gfs2_glock_put(gl);
-		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
-	}
+	if (lockref_put_or_lock(&gl->gl_lockref))
+		return;
+
+	lockref_mark_dead(&gl->gl_lockref);
+
+	spin_lock(&lru_lock);
+	__gfs2_glock_remove_from_lru(gl);
+	spin_unlock(&lru_lock);
+	spin_unlock(&gl->gl_lockref.lock);
+	spin_lock_bucket(gl->gl_hash);
+	hlist_bl_del_rcu(&gl->gl_list);
+	spin_unlock_bucket(gl->gl_hash);
+	GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
+	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+	trace_gfs2_glock_put(gl);
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
 }
 
 /**
@@ -244,7 +236,7 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
 			continue;
 		if (gl->gl_sbd != sdp)
 			continue;
-		if (atomic_inc_not_zero(&gl->gl_ref))
+		if (lockref_get_not_dead(&gl->gl_lockref))
 			return gl;
 	}
 
@@ -396,10 +388,11 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 	held2 = (new_state != LM_ST_UNLOCKED);
 
 	if (held1 != held2) {
+		GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
 		if (held2)
-			gfs2_glock_hold(gl);
+			gl->gl_lockref.count++;
 		else
-			gfs2_glock_put_nolock(gl);
+			gl->gl_lockref.count--;
 	}
 	if (held1 && held2 && list_empty(&gl->gl_holders))
 		clear_bit(GLF_QUEUED, &gl->gl_flags);
@@ -626,9 +619,9 @@ out:
 out_sched:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	smp_mb__after_clear_bit();
-	gfs2_glock_hold(gl);
+	gl->gl_lockref.count++;
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-		gfs2_glock_put_nolock(gl);
+		gl->gl_lockref.count--;
 	return;
 
 out_unlock:
@@ -754,7 +747,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_sbd = sdp;
 	gl->gl_flags = 0;
 	gl->gl_name = name;
-	atomic_set(&gl->gl_ref, 1);
+	gl->gl_lockref.count = 1;
 	gl->gl_state = LM_ST_UNLOCKED;
 	gl->gl_target = LM_ST_UNLOCKED;
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
@@ -1356,10 +1349,10 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 		}
 	}
 
-	spin_unlock(&gl->gl_spin);
+	gl->gl_lockref.count++;
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-	smp_wmb();
-	gfs2_glock_hold(gl);
+	spin_unlock(&gl->gl_spin);
+
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
 }
@@ -1404,15 +1397,19 @@ __acquires(&lru_lock)
 	while(!list_empty(list)) {
 		gl = list_entry(list->next, struct gfs2_glock, gl_lru);
 		list_del_init(&gl->gl_lru);
+		if (!spin_trylock(&gl->gl_spin)) {
+			list_add(&gl->gl_lru, &lru_list);
+			atomic_inc(&lru_count);
+			continue;
+		}
 		clear_bit(GLF_LRU, &gl->gl_flags);
-		gfs2_glock_hold(gl);
 		spin_unlock(&lru_lock);
-		spin_lock(&gl->gl_spin);
+		gl->gl_lockref.count++;
 		if (demote_ok(gl))
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-			gfs2_glock_put_nolock(gl);
+			gl->gl_lockref.count--;
 		spin_unlock(&gl->gl_spin);
 		spin_lock(&lru_lock);
 	}
@@ -1493,7 +1490,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
 
 	rcu_read_lock();
 	hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
-		if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref))
+		if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
 			examiner(gl);
 	}
 	rcu_read_unlock();
@@ -1746,7 +1743,7 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 		  state2str(gl->gl_demote_state), dtime,
 		  atomic_read(&gl->gl_ail_count),
 		  atomic_read(&gl->gl_revokes),
-		  atomic_read(&gl->gl_ref), gl->gl_hold_time);
+		  (int)gl->gl_lockref.count, gl->gl_hold_time);
 
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
 		error = dump_holder(seq, gh);
@@ -1902,7 +1899,7 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 			gi->nhash = 0;
 		}
 	/* Skip entries for other sb and dead entries */
-	} while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
+	} while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref));
 
 	return 0;
 }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 69f66e3d22bf..6647d77366ba 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -181,8 +181,6 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
 extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 			  const struct gfs2_glock_operations *glops,
 			  int create, struct gfs2_glock **glp);
-extern void gfs2_glock_hold(struct gfs2_glock *gl);
-extern void gfs2_glock_put_nolock(struct gfs2_glock *gl);
 extern void gfs2_glock_put(struct gfs2_glock *gl);
 extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
 			     unsigned flags, struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index e2e0a90396e7..db908f697139 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -525,9 +525,9 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
 
 	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
 	    gl->gl_state == LM_ST_SHARED && ip) {
-		gfs2_glock_hold(gl);
+		gl->gl_lockref.count++;
 		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
-			gfs2_glock_put_nolock(gl);
+			gl->gl_lockref.count--;
 	}
 }
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 2ab4f8d8f4c4..bb88e417231f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -21,6 +21,7 @@
 #include <linux/rbtree.h>
 #include <linux/ktime.h>
 #include <linux/percpu.h>
+#include <linux/lockref.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -321,9 +322,9 @@ struct gfs2_glock {
 	struct gfs2_sbd *gl_sbd;
 	unsigned long gl_flags;		/* GLF_... */
 	struct lm_lockname gl_name;
-	atomic_t gl_ref;
 
-	spinlock_t gl_spin;
+	struct lockref gl_lockref;
+#define gl_spin gl_lockref.lock
 
 	/* State fields protected by gl_spin */
 	unsigned int gl_state:2,	/* Current state */
diff --git a/include/linux/lockref.h b/include/linux/lockref.h
index f279ed9a9163..13dfd36a3294 100644
--- a/include/linux/lockref.h
+++ b/include/linux/lockref.h
@@ -36,4 +36,10 @@ extern int lockref_put_or_lock(struct lockref *);
 extern void lockref_mark_dead(struct lockref *);
 extern int lockref_get_not_dead(struct lockref *);
 
+/* Must be called under spinlock for reliable results */
+static inline int __lockref_is_dead(const struct lockref *l)
+{
+	return ((int)l->count < 0);
+}
+
 #endif /* __LINUX_LOCKREF_H */
diff --git a/lib/lockref.c b/lib/lockref.c
index e2cd2c0a8821..8ff162fe3413 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -136,6 +136,7 @@ void lockref_mark_dead(struct lockref *lockref)
 	assert_spin_locked(&lockref->lock);
 	lockref->count = -128;
 }
+EXPORT_SYMBOL(lockref_mark_dead);
 
 /**
  * lockref_get_not_dead - Increments count unless the ref is dead
-- 
cgit v1.3


From 78371a45dfa8f2a2f3892b983c39863a67c66b11 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 16 Oct 2013 08:25:11 -0400
Subject: ext4: fix assertion in ext4_add_complete_io()

It doesn't make sense to require io_end->handle when we are in
nojournal mode. So update the assertion accordingly to avoid false
warnings from ext4_add_complete_io().

Reported-by: Eric Whitney <enwlinux@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/page-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d7d0c7b46ed4..d488f80ee32d 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -197,14 +197,15 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
 static void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
 	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+	struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
 	struct workqueue_struct *wq;
 	unsigned long flags;
 
 	/* Only reserved conversions from writeback should enter here */
 	WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-	WARN_ON(!io_end->handle);
+	WARN_ON(!io_end->handle && sbi->s_journal);
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+	wq = sbi->rsv_conversion_wq;
 	if (list_empty(&ei->i_rsv_conversion_list))
 		queue_work(wq, &ei->i_rsv_conversion_work);
 	list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
-- 
cgit v1.3


From 7534e854b930a021dedf9e16396ced5e70e1aba3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 16 Oct 2013 08:26:08 -0400
Subject: ext4: fixup kerndoc annotation of mpage_map_and_submit_extent()

Document give_up_on_write argument of mpage_map_and_submit_extent().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e274e9c1171f..e7e5b3d8f002 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2178,6 +2178,9 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
  *
  * @handle - handle for journal operations
  * @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ *                     is no hope of writing the data. The caller should discard
+ *                     dirty pages to avoid infinite loops.
  *
  * The function maps extent starting at mpd->lblk of length mpd->len. If it is
  * delayed, blocks are allocated, if it is unwritten, we may need to convert
-- 
cgit v1.3


From 2046fd1873886e8c774d7c1b0961624c164e7db2 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 15 Oct 2013 18:45:17 -0500
Subject: ext3: Count journal as bsddf overhead in ext3_statfs

ext4 counts journal space as bsddf overhead, but ext3 does not.

For some reason when I patched ext4 I thought I should leave
ext3 alone, but frankly it makes more sense to fix it, I think.

Otherwise we get inconsistent behavior from ext3 under ext3.ko,
and ext3 under ext4.ko, which is not at all desirable...

This is testable by xfstests shared/289, though it will need
modification because it currently special-cases ext3.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c50c76190373..37fd31ed16e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2825,6 +2825,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 		 * bitmap, and an inode table.
 		 */
 		overhead += ngroups * (2 + sbi->s_itb_per_group);
+
+		/* Add the journal blocks as well */
+                overhead += sbi->s_journal->j_maxlen;
+
 		sbi->s_overhead_last = overhead;
 		smp_wmb();
 		sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
-- 
cgit v1.3


From a97f4a66d8ee4faf0f31cc4ad3f4aa0baef23fc7 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 16 Oct 2013 14:20:25 +0200
Subject: dlm: Avoid that dlm_release_lockspace() incorrectly returns -EBUSY

When dlm_release_lockspace(ls, 1) is invoked on a busy system
immediately after the last dlm_unlock() AST has finished it can occur
that lkb_idr_is_local() is invoked for the unlocked LKB since removal
from ls_lkbidr only occurs after the AST has returned. If that happens
dlm_release_lockspace(ls, 1) will return -EBUSY instead of releasing
the lockspace. Fix this race condition by changing lkb_idr_is_local()
such that it only returns true for LKB's that have not yet been
unlocked.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lockspace.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 88556dc0458e..d5abafd56a6d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -706,9 +706,7 @@ static int lkb_idr_is_local(int id, void *p, void *data)
 {
 	struct dlm_lkb *lkb = p;
 
-	if (!lkb->lkb_nodeid)
-		return 1;
-	return 0;
+	return lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV;
 }
 
 static int lkb_idr_is_any(int id, void *p, void *data)
-- 
cgit v1.3


From 2c6e24ce1aa6b3b147c75d488c2797ee258eb22b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 15 Oct 2013 09:17:49 +1100
Subject: xfs: prevent deadlock trying to cover an active log

Recent analysis of a deadlocked XFS filesystem from a kernel
crash dump indicated that the filesystem was stuck waiting for log
space. The short story of the hang on the RHEL6 kernel is this:

	- the tail of the log is pinned by an inode
	- the inode has been pushed by the xfsaild
	- the inode has been flushed to it's backing buffer and is
	  currently flush locked and hence waiting for backing
	  buffer IO to complete and remove it from the AIL
	- the backing buffer is marked for write - it is on the
	  delayed write queue
	- the inode buffer has been modified directly and logged
	  recently due to unlinked inode list modification
	- the backing buffer is pinned in memory as it is in the
	  active CIL context.
	- the xfsbufd won't start buffer writeback because it is
	  pinned
	- xfssyncd won't force the log because it sees the log as
	  needing to be covered and hence wants to issue a dummy
	  transaction to move the log covering state machine along.

Hence there is no trigger to force the CIL to the log and hence
unpin the inode buffer and therefore complete the inode IO, remove
it from the AIL and hence move the tail of the log along, allowing
transactions to start again.

Mainline kernels also have the same deadlock, though the signature
is slightly different - the inode buffer never reaches the delayed
write lists because xfs_buf_item_push() sees that it is pinned and
hence never adds it to the delayed write list that the xfsaild
flushes.

There are two possible solutions here. The first is to simply force
the log before trying to cover the log and so ensure that the CIL is
emptied before we try to reserve space for the dummy transaction in
the xfs_log_worker(). While this might work most of the time, it is
still racy and is no guarantee that we don't get stuck in
xfs_trans_reserve waiting for log space to come free. Hence it's not
the best way to solve the problem.

The second solution is to modify xfs_log_need_covered() to be aware
of the CIL. We only should be attempting to cover the log if there
is no current activity in the log - covering the log is the process
of ensuring that the head and tail in the log on disk are identical
(i.e. the log is clean and at idle). Hence, by definition, if there
are items in the CIL then the log is not at idle and so we don't
need to attempt to cover it.

When we don't need to cover the log because it is active or idle, we
issue a log force from xfs_log_worker() - if the log is idle, then
this does nothing.  However, if the log is active due to there being
items in the CIL, it will force the items in the CIL to the log and
unpin them.

In the case of the above deadlock scenario, instead of
xfs_log_worker() getting stuck in xfs_trans_reserve() attempting to
cover the log, it will instead force the log, thereby unpinning the
inode buffer, allowing IO to be issued and complete and hence
removing the inode that was pinning the tail of the log from the
AIL. At that point, everything will start moving along again. i.e.
the xfs_log_worker turns back into a watchdog that can alleviate
deadlocks based around pinned items that prevent the tail of the log
from being moved...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c      | 48 +++++++++++++++++++++++++++++-------------------
 fs/xfs/xfs_log_cil.c  | 14 ++++++++++++++
 fs/xfs/xfs_log_priv.h | 10 ++++------
 3 files changed, 47 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a2dea108071a..613ed9414e70 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1000,27 +1000,34 @@ xfs_log_space_wake(
 }
 
 /*
- * Determine if we have a transaction that has gone to disk
- * that needs to be covered. To begin the transition to the idle state
- * firstly the log needs to be idle (no AIL and nothing in the iclogs).
- * If we are then in a state where covering is needed, the caller is informed
- * that dummy transactions are required to move the log into the idle state.
+ * Determine if we have a transaction that has gone to disk that needs to be
+ * covered. To begin the transition to the idle state firstly the log needs to
+ * be idle. That means the CIL, the AIL and the iclogs needs to be empty before
+ * we start attempting to cover the log.
  *
- * Because this is called as part of the sync process, we should also indicate
- * that dummy transactions should be issued in anything but the covered or
- * idle states. This ensures that the log tail is accurately reflected in
- * the log at the end of the sync, hence if a crash occurrs avoids replay
- * of transactions where the metadata is already on disk.
+ * Only if we are then in a state where covering is needed, the caller is
+ * informed that dummy transactions are required to move the log into the idle
+ * state.
+ *
+ * If there are any items in the AIl or CIL, then we do not want to attempt to
+ * cover the log as we may be in a situation where there isn't log space
+ * available to run a dummy transaction and this can lead to deadlocks when the
+ * tail of the log is pinned by an item that is modified in the CIL.  Hence
+ * there's no point in running a dummy transaction at this point because we
+ * can't start trying to idle the log until both the CIL and AIL are empty.
  */
 int
 xfs_log_need_covered(xfs_mount_t *mp)
 {
-	int		needed = 0;
 	struct xlog	*log = mp->m_log;
+	int		needed = 0;
 
 	if (!xfs_fs_writable(mp))
 		return 0;
 
+	if (!xlog_cil_empty(log))
+		return 0;
+
 	spin_lock(&log->l_icloglock);
 	switch (log->l_covered_state) {
 	case XLOG_STATE_COVER_DONE:
@@ -1029,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp)
 		break;
 	case XLOG_STATE_COVER_NEED:
 	case XLOG_STATE_COVER_NEED2:
-		if (!xfs_ail_min_lsn(log->l_ailp) &&
-		    xlog_iclogs_empty(log)) {
-			if (log->l_covered_state == XLOG_STATE_COVER_NEED)
-				log->l_covered_state = XLOG_STATE_COVER_DONE;
-			else
-				log->l_covered_state = XLOG_STATE_COVER_DONE2;
-		}
-		/* FALLTHRU */
+		if (xfs_ail_min_lsn(log->l_ailp))
+			break;
+		if (!xlog_iclogs_empty(log))
+			break;
+
+		needed = 1;
+		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+			log->l_covered_state = XLOG_STATE_COVER_DONE;
+		else
+			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+		break;
 	default:
 		needed = 1;
 		break;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index cfe97973ba36..da8524e779b6 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -711,6 +711,20 @@ xlog_cil_push_foreground(
 	xlog_cil_push(log);
 }
 
+bool
+xlog_cil_empty(
+	struct xlog	*log)
+{
+	struct xfs_cil	*cil = log->l_cilp;
+	bool		empty = false;
+
+	spin_lock(&cil->xc_push_lock);
+	if (list_empty(&cil->xc_cil))
+		empty = true;
+	spin_unlock(&cil->xc_push_lock);
+	return empty;
+}
+
 /*
  * Commit a transaction with the given vector to the Committed Item List.
  *
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 136654b9400d..f80cff26fda9 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -514,12 +514,10 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
 /*
  * Committed Item List interfaces
  */
-int
-xlog_cil_init(struct xlog *log);
-void
-xlog_cil_init_post_recovery(struct xlog *log);
-void
-xlog_cil_destroy(struct xlog *log);
+int	xlog_cil_init(struct xlog *log);
+void	xlog_cil_init_post_recovery(struct xlog *log);
+void	xlog_cil_destroy(struct xlog *log);
+bool	xlog_cil_empty(struct xlog *log);
 
 /*
  * CIL force routines
-- 
cgit v1.3


From 08e96e1a3c5fd823f846df813b0b8be8e734c6c2 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 11 Oct 2013 20:59:05 -0500
Subject: xfs: remove newlines from strings passed to __xfs_printk

__xfs_printk adds its own "\n".  Having it in the original string
leads to unintentional blank lines from these messages.

Most format strings have no newline, but a few do, leading to
i.e.:

[ 7347.119911] XFS (sdb2): Access to block zero in inode 132 start_block: 0 start_off: 0 blkcnt: 0 extent-state: 0 lastx: 1a05
[ 7347.119911]
[ 7347.119919] XFS (sdb2): Access to block zero in inode 132 start_block: 0 start_off: 0 blkcnt: 0 extent-state: 0 lastx: 1a05
[ 7347.119919]

Fix them all.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_bmap.c        |  2 +-
 fs/xfs/xfs_buf.c         |  6 +++---
 fs/xfs/xfs_dir2_node.c   |  2 +-
 fs/xfs/xfs_error.c       |  2 +-
 fs/xfs/xfs_iomap.c       |  2 +-
 fs/xfs/xfs_log.c         |  2 +-
 fs/xfs/xfs_log_recover.c |  6 +++---
 fs/xfs/xfs_qm_syscalls.c | 12 ++++++------
 fs/xfs/xfs_sb.c          |  4 ++--
 fs/xfs/xfs_super.c       |  2 +-
 10 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f47e65c30be6..e6b42026f9d4 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1482,7 +1482,7 @@ xfs_bmap_search_extents(
 		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
 				"Access to block zero in inode %llu "
 				"start_block: %llx start_off: %llx "
-				"blkcnt: %llx extent-state: %x lastx: %x\n",
+				"blkcnt: %llx extent-state: %x lastx: %x",
 			(unsigned long long)ip->i_ino,
 			(unsigned long long)gotp->br_startblock,
 			(unsigned long long)gotp->br_startoff,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 263470075ea2..75e4ea7b109a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -590,7 +590,7 @@ found:
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
 			xfs_warn(target->bt_mount,
-				"%s: failed to map pages\n", __func__);
+				"%s: failed to map pagesn", __func__);
 			xfs_buf_relse(bp);
 			return NULL;
 		}
@@ -809,7 +809,7 @@ xfs_buf_get_uncached(
 	error = _xfs_buf_map_pages(bp, 0);
 	if (unlikely(error)) {
 		xfs_warn(target->bt_mount,
-			"%s: failed to map pages\n", __func__);
+			"%s: failed to map pages", __func__);
 		goto fail_free_mem;
 	}
 
@@ -1618,7 +1618,7 @@ xfs_setsize_buftarg_flags(
 		bdevname(btp->bt_bdev, name);
 
 		xfs_warn(btp->bt_mount,
-			"Cannot set_blocksize to %u on device %s\n",
+			"Cannot set_blocksize to %u on device %s",
 			sectorsize, name);
 		return EINVAL;
 	}
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 4c3dba7ffb74..34df052683eb 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1101,7 +1101,7 @@ xfs_dir2_leafn_rebalance(
 		state->inleaf = 1;
 		blk2->index = 0;
 		xfs_alert(args->dp->i_mount,
-	"%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
+	"%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
 			__func__, blk1->index);
 	}
 }
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 1123d93ff795..40f2985a27bc 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -159,7 +159,7 @@ xfs_error_report(
 {
 	if (level <= xfs_error_level) {
 		xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
-		"Internal error %s at line %d of file %s.  Caller 0x%p\n",
+		"Internal error %s at line %d of file %s.  Caller 0x%p",
 			    tag, linenum, filename, ra);
 
 		xfs_stack_trace();
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 521fdf2ac6da..338b33263c56 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -110,7 +110,7 @@ xfs_alert_fsblock_zero(
 	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
 			"Access to block zero in inode %llu "
 			"start_block: %llx start_off: %llx "
-			"blkcnt: %llx extent-state: %x\n",
+			"blkcnt: %llx extent-state: %x",
 		(unsigned long long)ip->i_ino,
 		(unsigned long long)imap->br_startblock,
 		(unsigned long long)imap->br_startoff,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 613ed9414e70..0461f0872244 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1989,7 +1989,7 @@ xlog_print_tic_res(
 
 	for (i = 0; i < ticket->t_res_num; i++) {
 		uint r_type = ticket->t_res_arr[i].r_type;
-		xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
+		xfs_warn(mp, "region[%u]: %s - %u bytes", i,
 			    ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
 			    "bad-rtype" : res_type_str[r_type-1]),
 			    ticket->t_res_arr[i].r_len);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 39797490a1f1..e7ca48e8f515 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -305,9 +305,9 @@ xlog_header_check_dump(
 	xfs_mount_t		*mp,
 	xlog_rec_header_t	*head)
 {
-	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d\n",
+	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
 		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
-	xfs_debug(mp, "    log : uuid = %pU, fmt = %d\n",
+	xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
 		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 }
 #else
@@ -4077,7 +4077,7 @@ xlog_unpack_data_crc(
 	if (crc != rhead->h_crc) {
 		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
 			xfs_alert(log->l_mp,
-		"log record CRC mismatch: found 0x%x, expected 0x%x.\n",
+		"log record CRC mismatch: found 0x%x, expected 0x%x.",
 					le32_to_cpu(rhead->h_crc),
 					le32_to_cpu(crc));
 			xfs_hex_dump(dp, 32);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 8174aad0b388..fb6709191811 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -287,7 +287,7 @@ xfs_qm_scall_trunc_qfiles(
 	int		error = 0, error2 = 0;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
-		xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
+		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
 			__func__, flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
 	}
@@ -325,7 +325,7 @@ xfs_qm_scall_quotaon(
 	sbflags = 0;
 
 	if (flags == 0) {
-		xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
+		xfs_debug(mp, "%s: zero flags, m_qflags=%x",
 			__func__, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
 	}
@@ -348,7 +348,7 @@ xfs_qm_scall_quotaon(
 	     (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
 	     (flags & XFS_PQUOTA_ENFD))) {
 		xfs_debug(mp,
-			"%s: Can't enforce without acct, flags=%x sbflags=%x\n",
+			"%s: Can't enforce without acct, flags=%x sbflags=%x",
 			__func__, flags, mp->m_sb.sb_qflags);
 		return XFS_ERROR(EINVAL);
 	}
@@ -648,7 +648,7 @@ xfs_qm_scall_setqlim(
 			q->qi_bsoftlimit = soft;
 		}
 	} else {
-		xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
+		xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
 	}
 	hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
 		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -664,7 +664,7 @@ xfs_qm_scall_setqlim(
 			q->qi_rtbsoftlimit = soft;
 		}
 	} else {
-		xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+		xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
 	}
 
 	hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -681,7 +681,7 @@ xfs_qm_scall_setqlim(
 			q->qi_isoftlimit = soft;
 		}
 	} else {
-		xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
+		xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
 	}
 
 	/*
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index a5b59d92eb70..a89d0bcc7959 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -249,13 +249,13 @@ xfs_mount_validate_sb(
 	if (xfs_sb_version_has_pquotino(sbp)) {
 		if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
 			xfs_notice(mp,
-			   "Version 5 of Super block has XFS_OQUOTA bits.\n");
+			   "Version 5 of Super block has XFS_OQUOTA bits.");
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 	} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
 				XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
 			xfs_notice(mp,
-"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n");
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
 			return XFS_ERROR(EFSCORRUPTED);
 	}
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 938e50fe6c61..c2e3e9535b26 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1241,7 +1241,7 @@ xfs_fs_remount(
 			 */
 #if 0
 			xfs_info(mp,
-		"mount option \"%s\" not supported for remount\n", p);
+		"mount option \"%s\" not supported for remount", p);
 			return -EINVAL;
 #else
 			break;
-- 
cgit v1.3


From 31625f28ad7be67701dc4cefcf52087addd88af4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 11 Oct 2013 14:12:31 -0500
Subject: xfs: don't emit corruption noise on fs probes

If we get EWRONGFS due to probing of non-xfs filesystems,
there's no need to issue the scary corruption error and backtrace.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_sb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index a89d0bcc7959..b4762279bb78 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -624,8 +624,9 @@ xfs_sb_read_verify(
 
 out_error:
 	if (error) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     mp, bp->b_addr);
+		if (error != EWRONGFS)
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+					     mp, bp->b_addr);
 		xfs_buf_ioerror(bp, error);
 	}
 }
-- 
cgit v1.3


From 59e5a0e821d838854b3afd030d31f82cee3ecd58 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 11 Oct 2013 14:14:05 -0500
Subject: xfs: don't break from growfs ag update loop on error

When xfs_growfs_data_private() is updating backup superblocks,
it bails out on the first error encountered, whether reading or
writing:

* If we get an error writing out the alternate superblocks,
* just issue a warning and continue.  The real work is
* already done and committed.

This can cause a problem later during repair, because repair
looks at all superblocks, and picks the most prevalent one
as correct.  If we bail out early in the backup superblock
loop, we can end up with more "bad" matching superblocks than
good, and a post-growfs repair may revert the filesystem to
the old geometry.

With the combination of superblock verifiers and old bugs,
we're more likely to encounter read errors due to verification.

And perhaps even worse, we don't even properly write any of the
newly-added superblocks in the new AGs.

Even with this change, growfs will still say:

  xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Structure needs cleaning
  data blocks changed from 319815680 to 335216640

which might be confusing to the user, but it at least communicates
that something has gone wrong, and dmesg will probably highlight
the need for an xfs_repair.

And this is still best-effort; if verifiers fail on more than
half the backup supers, they may still "win" - but that's probably
best left to repair to more gracefully handle by doing its own
strict verification as part of the backup super "voting."

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fsops.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fdae4ec5f21b..76c7b2b4fa8d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -155,7 +155,7 @@ xfs_growfs_data_private(
 	xfs_buf_t		*bp;
 	int			bucket;
 	int			dpct;
-	int			error;
+	int			error, saved_error = 0;
 	xfs_agnumber_t		nagcount;
 	xfs_agnumber_t		nagimax = 0;
 	xfs_rfsblock_t		nb, nb_mod;
@@ -498,29 +498,33 @@ xfs_growfs_data_private(
 				error = ENOMEM;
 		}
 
+		/*
+		 * If we get an error reading or writing alternate superblocks,
+		 * continue.  xfs_repair chooses the "best" superblock based
+		 * on most matches; if we break early, we'll leave more
+		 * superblocks un-updated than updated, and xfs_repair may
+		 * pick them over the properly-updated primary.
+		 */
 		if (error) {
 			xfs_warn(mp,
 		"error %d reading secondary superblock for ag %d",
 				error, agno);
-			break;
+			saved_error = error;
+			continue;
 		}
 		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
 
-		/*
-		 * If we get an error writing out the alternate superblocks,
-		 * just issue a warning and continue.  The real work is
-		 * already done and committed.
-		 */
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error) {
 			xfs_warn(mp,
 		"write error %d updating secondary superblock for ag %d",
 				error, agno);
-			break; /* no point in continuing */
+			saved_error = error;
+			continue;
 		}
 	}
-	return error;
+	return saved_error ? saved_error : error;
 
  error0:
 	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-- 
cgit v1.3


From aeac589a74b91c4c07458272767e089810fbd23d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 17 Oct 2013 18:56:16 -0400
Subject: ext4: fix performance regression in ext4_writepages

Commit 4e7ea81db5(ext4: restructure writeback path) introduces another
performance regression on random write:

- one more page may be added to ext4 extent in
  mpage_prepare_extent_to_map, and will be submitted for I/O so
  nr_to_write will become -1 before 'done' is set

- the worse thing is that dirty pages may still be retrieved from page
  cache after nr_to_write becomes negative, so lots of small chunks
  can be submitted to block device when page writeback is catching up
  with write path, and performance is hurted.

On one arm A15 board with sata 3.0 SSD(CPU: 1.5GHz dura core, RAM:
2GB, SATA controller: 3.0Gbps), this patch can improve below test's
result from 157MB/sec to 174MB/sec(>10%):

	dd if=/dev/zero of=./z.img bs=8K count=512K

The above test is actually prototype of block write in bonnie++
utility.

This patch makes sure no more pages than nr_to_write can be added to
extent for mapping, so that nr_to_write won't become negative.

Cc: linux-ext4@vger.kernel.org
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e7e5b3d8f002..94aac67b55c9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2298,6 +2298,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	struct address_space *mapping = mpd->inode->i_mapping;
 	struct pagevec pvec;
 	unsigned int nr_pages;
+	long left = mpd->wbc->nr_to_write;
 	pgoff_t index = mpd->first_page;
 	pgoff_t end = mpd->last_page;
 	int tag;
@@ -2333,6 +2334,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			if (page->index > end)
 				goto out;
 
+			/*
+			 * Accumulated enough dirty pages? This doesn't apply
+			 * to WB_SYNC_ALL mode. For integrity sync we have to
+			 * keep going because someone may be concurrently
+			 * dirtying pages, and we might have synced a lot of
+			 * newly appeared dirty pages, but have not synced all
+			 * of the old dirty pages.
+			 */
+			if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+				goto out;
+
 			/* If we can't merge this page, we are done. */
 			if (mpd->map.m_len > 0 && mpd->next_page != page->index)
 				goto out;
@@ -2367,19 +2379,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			if (err <= 0)
 				goto out;
 			err = 0;
-
-			/*
-			 * Accumulated enough dirty pages? This doesn't apply
-			 * to WB_SYNC_ALL mode. For integrity sync we have to
-			 * keep going because someone may be concurrently
-			 * dirtying pages, and we might have synced a lot of
-			 * newly appeared dirty pages, but have not synced all
-			 * of the old dirty pages.
-			 */
-			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
-			    mpd->next_page - mpd->first_page >=
-							mpd->wbc->nr_to_write)
-				goto out;
+			left--;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
-- 
cgit v1.3


From 3d1e38073b3f33d31df1ae09aaccab3dfd9fb61e Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Tue, 8 Oct 2013 18:01:51 +0900
Subject: f2fs: fix to store and retrieve i_rdev correctly

When storing i_rdev, we should check its file type.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/inode.c | 49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index c8c058024af8..7377ca3ce5c5 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -37,6 +37,31 @@ void f2fs_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
+static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+{
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+			S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		if (ri->i_addr[0])
+			inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+		else
+			inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1]));
+	}
+}
+
+static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+{
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev));
+			ri->i_addr[1] = 0;
+		} else {
+			ri->i_addr[0] = 0;
+			ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev));
+			ri->i_addr[2] = 0;
+		}
+	}
+}
+
 static int do_read_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -73,10 +98,6 @@ static int do_read_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
 	inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
 	inode->i_generation = le32_to_cpu(ri->i_generation);
-	if (ri->i_addr[0])
-		inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0]));
-	else
-		inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1]));
 
 	fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
 	fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
@@ -84,8 +105,13 @@ static int do_read_inode(struct inode *inode)
 	fi->flags = 0;
 	fi->i_advise = ri->i_advise;
 	fi->i_pino = le32_to_cpu(ri->i_pino);
+
 	get_extent_info(&fi->ext, ri->i_ext);
 	get_inline_info(fi, ri);
+
+	/* get rdev by using inline_info */
+	__get_inode_rdev(inode, ri);
+
 	f2fs_put_page(node_page, 1);
 	return 0;
 }
@@ -179,21 +205,10 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
 	ri->i_generation = cpu_to_le32(inode->i_generation);
 
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		if (old_valid_dev(inode->i_rdev)) {
-			ri->i_addr[0] =
-				cpu_to_le32(old_encode_dev(inode->i_rdev));
-			ri->i_addr[1] = 0;
-		} else {
-			ri->i_addr[0] = 0;
-			ri->i_addr[1] =
-				cpu_to_le32(new_encode_dev(inode->i_rdev));
-			ri->i_addr[2] = 0;
-		}
-	}
-
+	__set_inode_rdev(inode, ri);
 	set_cold_node(inode, node_page);
 	set_page_dirty(node_page);
+
 	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
 }
 
-- 
cgit v1.3


From b1838f8952123842b00d6f8979e0f19a3b680e87 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Thu, 10 Oct 2013 14:36:00 +0900
Subject: f2fs: fix the starvation problem on cp_rwsem

This patch removes the logic previously introduced to address the starvation
on cp_rwsem.

One potential there-in bug is that we should cover the wait.list with spin_lock,
but the previous code broke this rule.

And, actually current rwsem handles this starvation issue reasonably, so that we
didn't need to do this before neither.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/f2fs.h  | 13 -------------
 fs/f2fs/super.c |  1 -
 2 files changed, 14 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a955a59dfdbe..308967b71674 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -18,8 +18,6 @@
 #include <linux/crc32.h>
 #include <linux/magic.h>
 #include <linux/kobject.h>
-#include <linux/wait.h>
-#include <linux/sched.h>
 
 /*
  * For mount options
@@ -360,7 +358,6 @@ struct f2fs_sb_info {
 	struct inode *meta_inode;		/* cache meta blocks */
 	struct mutex cp_mutex;			/* checkpoint procedure lock */
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
-	wait_queue_head_t cp_wait;		/* checkpoint wait queue */
 	struct mutex node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
 	int por_doing;				/* recovery is doing or not */
@@ -516,13 +513,6 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
 
 static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
 {
-	/*
-	 * If the checkpoint thread is waiting for cp_rwsem, add cuurent task
-	 * into wait list to avoid the checkpoint thread starvation
-	 */
-	while (!list_empty(&sbi->cp_rwsem.wait_list))
-		wait_event_interruptible(sbi->cp_wait,
-				list_empty(&sbi->cp_rwsem.wait_list));
 	down_read(&sbi->cp_rwsem);
 }
 
@@ -539,9 +529,6 @@ static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
 static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 {
 	up_write(&sbi->cp_rwsem);
-
-	/* wake up all tasks blocked by checkpoint */
-	wake_up_all(&sbi->cp_wait);
 }
 
 /*
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 539c009d4138..3b786c85b5cd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -841,7 +841,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	spin_lock_init(&sbi->stat_lock);
 	init_rwsem(&sbi->bio_sem);
 	init_rwsem(&sbi->cp_rwsem);
-	init_waitqueue_head(&sbi->cp_wait);
 	init_sb_info(sbi);
 
 	/* get an inode for meta space */
-- 
cgit v1.3


From 9076a75f8e0f23ab64e2f34eb6be144e81f00a3e Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Mon, 14 Oct 2013 18:47:11 +0800
Subject: f2fs: introduce function read_raw_super_block()

Introduce function read_raw_super_block() to hide reading raw super block and
the retry routine if the first sb is invalid.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/super.c | 55 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3b786c85b5cd..692f35fbae1a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -746,30 +746,47 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 		atomic_set(&sbi->nr_pages[i], 0);
 }
 
-static int validate_superblock(struct super_block *sb,
-		struct f2fs_super_block **raw_super,
-		struct buffer_head **raw_super_buf, sector_t block)
+/*
+ * Read f2fs raw super block.
+ * Because we have two copies of super block, so read the first one at first,
+ * if the first one is invalid, move to read the second one.
+ */
+static int read_raw_super_block(struct super_block *sb,
+			struct f2fs_super_block **raw_super,
+			struct buffer_head **raw_super_buf)
 {
-	const char *super = (block == 0 ? "first" : "second");
+	int block = 0;
 
-	/* read f2fs raw super block */
+retry:
 	*raw_super_buf = sb_bread(sb, block);
 	if (!*raw_super_buf) {
-		f2fs_msg(sb, KERN_ERR, "unable to read %s superblock",
-				super);
-		return -EIO;
+		f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
+				block + 1);
+		if (block == 0) {
+			block++;
+			goto retry;
+		} else {
+			return -EIO;
+		}
 	}
 
 	*raw_super = (struct f2fs_super_block *)
 		((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
 
 	/* sanity checking of raw super */
-	if (!sanity_check_raw_super(sb, *raw_super))
-		return 0;
+	if (sanity_check_raw_super(sb, *raw_super)) {
+		brelse(*raw_super_buf);
+		f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
+				"in %dth superblock", block + 1);
+		if(block == 0) {
+			block++;
+			goto retry;
+		} else {
+			return -EINVAL;
+		}
+	}
 
-	f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
-				"in %s superblock", super);
-	return -EINVAL;
+	return 0;
 }
 
 static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -791,14 +808,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	err = validate_superblock(sb, &raw_super, &raw_super_buf, 0);
-	if (err) {
-		brelse(raw_super_buf);
-		/* check secondary superblock when primary failed */
-		err = validate_superblock(sb, &raw_super, &raw_super_buf, 1);
-		if (err)
-			goto free_sb_buf;
-	}
+	err = read_raw_super_block(sb, &raw_super, &raw_super_buf);
+	if (err)
+		goto free_sbi;
+
 	sb->s_fs_info = sbi;
 	/* init some FS parameters */
 	sbi->active_logs = NR_CURSEG_TYPE;
-- 
cgit v1.3


From e234088758fca3a669ebb1a02d8bf7bf60f0e4ff Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Mon, 14 Oct 2013 18:45:56 +0800
Subject: f2fs: avoid wait if IO end up when do_checkpoint for better
 performance

Previously, do_checkpoint() will call congestion_wait() for waiting the pages
(previous submitted node/meta/data pages) to be written back.
Because congestion_wait() will set a regular period (e.g. HZ / 50 ) for waiting, and
no additional wake up mechanism was introduced if IO ends up before regular period costed.
Yuan Zhong found there is a situation that after the pages have been written back,
but the checkpoint thread still wait for congestion_wait to exit.

So here we store checkpoint task into f2fs_sb when doing checkpoint, it'll wait for IO completes
if there's IO going on, and in the end IO path, wake up checkpoint task when IO ends up.

Thanks to Yuan Zhong's pre work about this problem.

Reported-by: Yuan Zhong <yuan.mark.zhong@samsung.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c | 11 +++++++++--
 fs/f2fs/f2fs.h       |  1 +
 fs/f2fs/segment.c    |  4 ++++
 3 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index d80882763ffd..2a5999d865b2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -757,8 +757,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	f2fs_put_page(cp_page, 1);
 
 	/* wait for previous submitted node/meta pages writeback */
-	while (get_pages(sbi, F2FS_WRITEBACK))
-		congestion_wait(BLK_RW_ASYNC, HZ / 50);
+	sbi->cp_task = current;
+	while (get_pages(sbi, F2FS_WRITEBACK)) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!get_pages(sbi, F2FS_WRITEBACK))
+			break;
+		io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+	sbi->cp_task = NULL;
 
 	filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
 	filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 308967b71674..171c52fc95bb 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -362,6 +362,7 @@ struct f2fs_sb_info {
 	struct mutex writepages;		/* mutex for writepages() */
 	int por_doing;				/* recovery is doing or not */
 	int on_build_free_nids;			/* build_free_nids is doing */
+	struct task_struct *cp_task;		/* checkpoint task */
 
 	/* for orphan inode management */
 	struct list_head orphan_inode_list;	/* orphan inode list */
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index bd79bbeceb1d..3b203597c744 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -597,6 +597,10 @@ static void f2fs_end_io_write(struct bio *bio, int err)
 
 	if (p->is_sync)
 		complete(p->wait);
+
+	if (!get_pages(p->sbi, F2FS_WRITEBACK) && p->sbi->cp_task)
+		wake_up_process(p->sbi->cp_task);
+
 	kfree(p);
 	bio_put(bio);
 }
-- 
cgit v1.3


From 87a9bd265678ec3cc8431b14bcb14c68d0f94032 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Wed, 16 Oct 2013 15:09:26 +0900
Subject: f2fs: avoid to write during the recovery

This patch enhances the recovery routine not to write any data/node/meta until
its completion.
If any writes are sent to the disk, it could contaminate the written history
that will be used for further recovery.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c |  2 +-
 fs/f2fs/node.c       | 17 +++++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 2a5999d865b2..8d16071a3b23 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -81,7 +81,7 @@ static int f2fs_write_meta_page(struct page *page,
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	/* Should not write any meta pages, if any IO error was occurred */
-	if (wbc->for_reclaim ||
+	if (wbc->for_reclaim || sbi->por_doing ||
 			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
 		dec_page_count(sbi, F2FS_DIRTY_META);
 		wbc->pages_skipped++;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 51ef27894433..ef80f791b02b 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1156,6 +1156,9 @@ static int f2fs_write_node_page(struct page *page,
 	block_t new_addr;
 	struct node_info ni;
 
+	if (sbi->por_doing)
+		goto redirty_out;
+
 	wait_on_page_writeback(page);
 
 	/* get old block addr of this node page */
@@ -1171,12 +1174,8 @@ static int f2fs_write_node_page(struct page *page,
 		return 0;
 	}
 
-	if (wbc->for_reclaim) {
-		dec_page_count(sbi, F2FS_DIRTY_NODES);
-		wbc->pages_skipped++;
-		set_page_dirty(page);
-		return AOP_WRITEPAGE_ACTIVATE;
-	}
+	if (wbc->for_reclaim)
+		goto redirty_out;
 
 	mutex_lock(&sbi->node_write);
 	set_page_writeback(page);
@@ -1186,6 +1185,12 @@ static int f2fs_write_node_page(struct page *page,
 	mutex_unlock(&sbi->node_write);
 	unlock_page(page);
 	return 0;
+
+redirty_out:
+	dec_page_count(sbi, F2FS_DIRTY_NODES);
+	wbc->pages_skipped++;
+	set_page_dirty(page);
+	return AOP_WRITEPAGE_ACTIVATE;
 }
 
 /*
-- 
cgit v1.3


From efbed4dc5857f845d787e406ce85097d1ccc5c4f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 17 Oct 2013 21:11:01 -0400
Subject: ext4: add ratelimiting to ext4 messages

In the case of a storage device that suddenly disappears, or in the
case of significant file system corruption, this can result in a huge
flood of messages being sent to the console.  This can overflow the
file system containing /var/log/messages, or if a serial console is
configured, this can slow down the system so much that a hardware
watchdog can end up triggering forcing a system reboot.

Google-Bug-Id: 7258357

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |   6 +++
 fs/ext4/super.c | 152 +++++++++++++++++++++++++++++++++++---------------------
 2 files changed, 100 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af815ea9d7cc..65485ab7a889 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,7 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
@@ -1314,6 +1315,11 @@ struct ext4_sb_info {
 	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+	/* Ratelimit ext4 messages. */
+	struct ratelimit_state s_err_ratelimit_state;
+	struct ratelimit_state s_warning_ratelimit_state;
+	struct ratelimit_state s_msg_ratelimit_state;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2c2e6cbc6bed..d3a857bfae47 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -411,20 +411,26 @@ static void ext4_handle_error(struct super_block *sb)
 			sb->s_id);
 }
 
+#define ext4_error_ratelimit(sb)					\
+		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
+			     "EXT4-fs error")
+
 void __ext4_error(struct super_block *sb, const char *function,
 		  unsigned int line, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
 
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
-	       sb->s_id, function, line, current->comm, &vaf);
-	va_end(args);
+	if (ext4_error_ratelimit(sb)) {
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		printk(KERN_CRIT
+		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+		       sb->s_id, function, line, current->comm, &vaf);
+		va_end(args);
+	}
 	save_error_info(sb, function, line);
-
 	ext4_handle_error(sb);
 }
 
@@ -438,22 +444,23 @@ void __ext4_error_inode(struct inode *inode, const char *function,
 
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	es->s_last_error_block = cpu_to_le64(block);
+	if (ext4_error_ratelimit(inode->i_sb)) {
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		if (block)
+			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+			       "inode #%lu: block %llu: comm %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       block, current->comm, &vaf);
+		else
+			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+			       "inode #%lu: comm %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       current->comm, &vaf);
+		va_end(args);
+	}
 	save_error_info(inode->i_sb, function, line);
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	if (block)
-		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
-		       "inode #%lu: block %llu: comm %s: %pV\n",
-		       inode->i_sb->s_id, function, line, inode->i_ino,
-		       block, current->comm, &vaf);
-	else
-		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
-		       "inode #%lu: comm %s: %pV\n",
-		       inode->i_sb->s_id, function, line, inode->i_ino,
-		       current->comm, &vaf);
-	va_end(args);
-
 	ext4_handle_error(inode->i_sb);
 }
 
@@ -469,27 +476,28 @@ void __ext4_error_file(struct file *file, const char *function,
 
 	es = EXT4_SB(inode->i_sb)->s_es;
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	if (ext4_error_ratelimit(inode->i_sb)) {
+		path = d_path(&(file->f_path), pathname, sizeof(pathname));
+		if (IS_ERR(path))
+			path = "(unknown)";
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		if (block)
+			printk(KERN_CRIT
+			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+			       "block %llu: comm %s: path %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       block, current->comm, path, &vaf);
+		else
+			printk(KERN_CRIT
+			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+			       "comm %s: path %s: %pV\n",
+			       inode->i_sb->s_id, function, line, inode->i_ino,
+			       current->comm, path, &vaf);
+		va_end(args);
+	}
 	save_error_info(inode->i_sb, function, line);
-	path = d_path(&(file->f_path), pathname, sizeof(pathname));
-	if (IS_ERR(path))
-		path = "(unknown)";
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	if (block)
-		printk(KERN_CRIT
-		       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
-		       "block %llu: comm %s: path %s: %pV\n",
-		       inode->i_sb->s_id, function, line, inode->i_ino,
-		       block, current->comm, path, &vaf);
-	else
-		printk(KERN_CRIT
-		       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
-		       "comm %s: path %s: %pV\n",
-		       inode->i_sb->s_id, function, line, inode->i_ino,
-		       current->comm, path, &vaf);
-	va_end(args);
-
 	ext4_handle_error(inode->i_sb);
 }
 
@@ -543,11 +551,13 @@ void __ext4_std_error(struct super_block *sb, const char *function,
 	    (sb->s_flags & MS_RDONLY))
 		return;
 
-	errstr = ext4_decode_error(sb, errno, nbuf);
-	printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
-	       sb->s_id, function, line, errstr);
-	save_error_info(sb, function, line);
+	if (ext4_error_ratelimit(sb)) {
+		errstr = ext4_decode_error(sb, errno, nbuf);
+		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+		       sb->s_id, function, line, errstr);
+	}
 
+	save_error_info(sb, function, line);
 	ext4_handle_error(sb);
 }
 
@@ -597,6 +607,9 @@ void __ext4_msg(struct super_block *sb,
 	struct va_format vaf;
 	va_list args;
 
+	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
+		return;
+
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
@@ -610,6 +623,10 @@ void __ext4_warning(struct super_block *sb, const char *function,
 	struct va_format vaf;
 	va_list args;
 
+	if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+			  "EXT4-fs warning"))
+		return;
+
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
@@ -633,18 +650,20 @@ __acquires(bitlock)
 	es->s_last_error_block = cpu_to_le64(block);
 	__save_error_info(sb, function, line);
 
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
-	       sb->s_id, function, line, grp);
-	if (ino)
-		printk(KERN_CONT "inode %lu: ", ino);
-	if (block)
-		printk(KERN_CONT "block %llu:", (unsigned long long) block);
-	printk(KERN_CONT "%pV\n", &vaf);
-	va_end(args);
+	if (ext4_error_ratelimit(sb)) {
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
+		       sb->s_id, function, line, grp);
+		if (ino)
+			printk(KERN_CONT "inode %lu: ", ino);
+		if (block)
+			printk(KERN_CONT "block %llu:",
+			       (unsigned long long) block);
+		printk(KERN_CONT "%pV\n", &vaf);
+		va_end(args);
+	}
 
 	if (test_opt(sb, ERRORS_CONT)) {
 		ext4_commit_super(sb, 0);
@@ -2606,6 +2625,12 @@ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2623,6 +2648,12 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(max_writeback_mb_bump),
 	ATTR_LIST(extent_max_zeroout_kb),
 	ATTR_LIST(trigger_fs_error),
+	ATTR_LIST(err_ratelimit_interval_ms),
+	ATTR_LIST(err_ratelimit_burst),
+	ATTR_LIST(warning_ratelimit_interval_ms),
+	ATTR_LIST(warning_ratelimit_burst),
+	ATTR_LIST(msg_ratelimit_interval_ms),
+	ATTR_LIST(msg_ratelimit_burst),
 	NULL,
 };
 
@@ -4118,6 +4149,11 @@ no_journal:
 	if (es->s_error_count)
 		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
 
+	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
+	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
+	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
+	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+
 	kfree(orig_data);
 	return 0;
 
-- 
cgit v1.3


From 444996027e80f96c9948d8d5b8f067a8aff9861d Mon Sep 17 00:00:00 2001
From: "Peter A. Felvegi" <petschy@gmail.com>
Date: Fri, 18 Oct 2013 20:07:44 +0200
Subject: udf: fix for pathetic mount times in case of invalid file system

The UDF driver was not strict enough about checking the IDs in the
VSDs when mounting, which resulted in reading through all the sectors
of the block device in some unfortunate cases. Eg, trying to mount my
uninitialized 200G SSD partition (all 0xFF bytes) took ~350 minutes to
fail, because the code expected some of the valid IDs or a zero byte.
During this, the mount couldn't be killed, sync from the cmdline
blocked, and the machine froze into the shutdown. Valid filesystems
(extX, btrfs, ntfs) were rejected by the mere accident of having a
zero byte at just the right place in some of their sectors, close
enough to the beginning not to generate excess I/O. The fix adds a
hard limit on the VSD sector offset, adds the two missing VSD IDs, and
stops scanning when encountering an invalid ID. Also replaced the
magic number 32768 with a more meaningful #define, and supressed the
bogus message about failing to read the first sector if no UDF fs was
detected.

Signed-off-by: Peter A. Felvegi <petschy@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 91219385691d..3306b9f69bed 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -76,6 +76,9 @@
 
 #define UDF_DEFAULT_BLOCKSIZE 2048
 
+#define VSD_FIRST_SECTOR_OFFSET		32768
+#define VSD_MAX_SECTOR_OFFSET		0x800000
+
 enum { UDF_MAX_LINKS = 0xffff };
 
 /* These are the "meat" - everything else is stuffing */
@@ -685,7 +688,7 @@ out_unlock:
 static loff_t udf_check_vsd(struct super_block *sb)
 {
 	struct volStructDesc *vsd = NULL;
-	loff_t sector = 32768;
+	loff_t sector = VSD_FIRST_SECTOR_OFFSET;
 	int sectorsize;
 	struct buffer_head *bh = NULL;
 	int nsr02 = 0;
@@ -703,8 +706,18 @@ static loff_t udf_check_vsd(struct super_block *sb)
 	udf_debug("Starting at sector %u (%ld byte sectors)\n",
 		  (unsigned int)(sector >> sb->s_blocksize_bits),
 		  sb->s_blocksize);
-	/* Process the sequence (if applicable) */
-	for (; !nsr02 && !nsr03; sector += sectorsize) {
+	/* Process the sequence (if applicable). The hard limit on the sector
+	 * offset is arbitrary, hopefully large enough so that all valid UDF
+	 * filesystems will be recognised. There is no mention of an upper
+	 * bound to the size of the volume recognition area in the standard.
+	 *  The limit will prevent the code to read all the sectors of a
+	 * specially crafted image (like a bluray disc full of CD001 sectors),
+	 * potentially causing minutes or even hours of uninterruptible I/O
+	 * activity. This actually happened with uninitialised SSD partitions
+	 * (all 0xFF) before the check for the limit and all valid IDs were
+	 * added */
+	for (; !nsr02 && !nsr03 && sector < VSD_MAX_SECTOR_OFFSET;
+	     sector += sectorsize) {
 		/* Read a block */
 		bh = udf_tread(sb, sector >> sb->s_blocksize_bits);
 		if (!bh)
@@ -714,10 +727,7 @@ static loff_t udf_check_vsd(struct super_block *sb)
 		vsd = (struct volStructDesc *)(bh->b_data +
 					      (sector & (sb->s_blocksize - 1)));
 
-		if (vsd->stdIdent[0] == 0) {
-			brelse(bh);
-			break;
-		} else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
+		if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
 				    VSD_STD_ID_LEN)) {
 			switch (vsd->structType) {
 			case 0:
@@ -753,6 +763,17 @@ static loff_t udf_check_vsd(struct super_block *sb)
 		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03,
 				    VSD_STD_ID_LEN))
 			nsr03 = sector;
+		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BOOT2,
+				    VSD_STD_ID_LEN))
+			; /* nothing */
+		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CDW02,
+				    VSD_STD_ID_LEN))
+			; /* nothing */
+		else {
+			/* invalid id : end of volume recognition area */
+			brelse(bh);
+			break;
+		}
 		brelse(bh);
 	}
 
@@ -760,7 +781,8 @@ static loff_t udf_check_vsd(struct super_block *sb)
 		return nsr03;
 	else if (nsr02)
 		return nsr02;
-	else if (sector - (sbi->s_session << sb->s_blocksize_bits) == 32768)
+	else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) ==
+			VSD_FIRST_SECTOR_OFFSET)
 		return -1;
 	else
 		return 0;
@@ -1270,6 +1292,9 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 	 * PHYSICAL partitions are already set up
 	 */
 	type1_idx = i;
+#ifdef UDFFS_DEBUG
+	map = NULL; /* supress 'maybe used uninitialized' warning */
+#endif
 	for (i = 0; i < sbi->s_partitions; i++) {
 		map = &sbi->s_partmaps[i];
 
@@ -1891,7 +1916,9 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
 			return 0;
 		}
 		if (nsr_off == -1)
-			udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n");
+			udf_debug("Failed to read sector at offset %d. "
+				  "Assuming open disc. Skipping validity "
+				  "check\n", VSD_FIRST_SECTOR_OFFSET);
 		if (!sbi->s_last_block)
 			sbi->s_last_block = udf_get_last_block(sb);
 	} else {
-- 
cgit v1.3


From 76ca4c238cf5858f8ba5bcd24c94c23717f1e14e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 14 Oct 2013 07:09:35 -0700
Subject: xfs: always take the iolock around xfs_setattr_size

There is no reason to conditionally take the iolock inside xfs_setattr_size
when we can let the caller handle it unconditionally, which just incrases
the lock hold time for the case where it was previously taken internally
by a few instructions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_bmap_util.c |  3 +--
 fs/xfs/xfs_file.c      |  2 +-
 fs/xfs/xfs_iops.c      | 39 +++++++++++++++++++++------------------
 fs/xfs/xfs_iops.h      |  2 +-
 4 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 97f952caea74..291e30cedfd3 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1622,8 +1622,7 @@ xfs_change_file_space(
 		iattr.ia_valid = ATTR_SIZE;
 		iattr.ia_size = startoffset;
 
-		error = xfs_setattr_size(ip, &iattr,
-					 attr_flags | XFS_ATTR_NOLOCK);
+		error = xfs_setattr_size(ip, &iattr);
 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
 		if (error)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4c749ab543d0..c9179820c01b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -852,7 +852,7 @@ xfs_file_fallocate(
 
 		iattr.ia_valid = ATTR_SIZE;
 		iattr.ia_size = new_size;
-		error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
+		error = -xfs_setattr_size(ip, &iattr);
 	}
 
 out_unlock:
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2b8952d9199b..52ecd1ccec9f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -709,8 +709,7 @@ out_dqrele:
 int
 xfs_setattr_size(
 	struct xfs_inode	*ip,
-	struct iattr		*iattr,
-	int			flags)
+	struct iattr		*iattr)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct inode		*inode = VFS_I(ip);
@@ -733,15 +732,11 @@ xfs_setattr_size(
 	if (error)
 		return XFS_ERROR(error);
 
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(S_ISREG(ip->i_d.di_mode));
 	ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 			ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
 
-	if (!(flags & XFS_ATTR_NOLOCK)) {
-		lock_flags |= XFS_IOLOCK_EXCL;
-		xfs_ilock(ip, lock_flags);
-	}
-
 	oldsize = inode->i_size;
 	newsize = iattr->ia_size;
 
@@ -750,12 +745,11 @@ xfs_setattr_size(
 	 */
 	if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
 		if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
-			goto out_unlock;
+			return 0;
 
 		/*
 		 * Use the regular setattr path to update the timestamps.
 		 */
-		xfs_iunlock(ip, lock_flags);
 		iattr->ia_valid &= ~ATTR_SIZE;
 		return xfs_setattr_nonsize(ip, iattr, 0);
 	}
@@ -765,7 +759,7 @@ xfs_setattr_size(
 	 */
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
-		goto out_unlock;
+		return error;
 
 	/*
 	 * Now we can make the changes.  Before we join the inode to the
@@ -783,7 +777,7 @@ xfs_setattr_size(
 		 */
 		error = xfs_zero_eof(ip, newsize, oldsize);
 		if (error)
-			goto out_unlock;
+			return error;
 	}
 
 	/*
@@ -802,7 +796,7 @@ xfs_setattr_size(
 		error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						      ip->i_d.di_size, newsize);
 		if (error)
-			goto out_unlock;
+			return error;
 	}
 
 	/*
@@ -812,7 +806,7 @@ xfs_setattr_size(
 
 	error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
 	if (error)
-		goto out_unlock;
+		return error;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
@@ -916,12 +910,21 @@ out_trans_cancel:
 
 STATIC int
 xfs_vn_setattr(
-	struct dentry	*dentry,
-	struct iattr	*iattr)
+	struct dentry		*dentry,
+	struct iattr		*iattr)
 {
-	if (iattr->ia_valid & ATTR_SIZE)
-		return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0);
-	return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);
+	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
+	int			error;
+
+	if (iattr->ia_valid & ATTR_SIZE) {
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		error = xfs_setattr_size(ip, iattr);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	} else {
+		error = xfs_setattr_nonsize(ip, iattr, 0);
+	}
+
+	return -error;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index d81fb41205ec..3e76dcba7a79 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -38,6 +38,6 @@ extern void xfs_setup_inode(struct xfs_inode *);
 
 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
 			       int flags);
-extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
+extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
 
 #endif /* __XFS_IOPS_H__ */
-- 
cgit v1.3


From 001a3e73706fcb8c52dcca1d4742679384d04184 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Oct 2013 00:55:05 -0700
Subject: xfs: remove the unused XFS_ATTR_NONBLOCK flag

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ioctl.c | 3 ---
 fs/xfs/xfs_iops.h  | 1 -
 2 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 668e8f4ccf5e..5a57e0c80b6b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -661,9 +661,6 @@ xfs_ioc_space(
 	if (!S_ISREG(inode->i_mode))
 		return -XFS_ERROR(EINVAL);
 
-	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		attr_flags |= XFS_ATTR_NONBLOCK;
-
 	if (filp->f_flags & O_DSYNC)
 		attr_flags |= XFS_ATTR_SYNC;
 
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 3e76dcba7a79..fe4013af2586 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -31,7 +31,6 @@ extern void xfs_setup_inode(struct xfs_inode *);
  * Internal setattr interfaces.
  */
 #define	XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
-#define	XFS_ATTR_NONBLOCK	0x02	/* return EAGAIN if op would block */
 #define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
 #define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
 #define XFS_ATTR_SYNC		0x10	/* synchronous operation required */
-- 
cgit v1.3


From 5f8aca8b43f41c4b2d4dc5389124a40cab92b644 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Oct 2013 00:55:06 -0700
Subject: xfs: always hold the iolock when calling xfs_change_file_space

Currently fallocate always holds the iolock when calling into
xfs_change_file_space, while the ioctl path lets some of the lower level
functions take it, but leave it out in others.

This patch makes sure the ioctl path also always holds the iolock and
thus introduces consistent locking for the preallocation operations while
simplifying the code and allowing to kill the now unused XFS_ATTR_NOLOCK
flag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_bmap_util.c | 71 +++++++++++++++++---------------------------------
 fs/xfs/xfs_file.c      |  2 +-
 fs/xfs/xfs_ioctl.c     |  2 ++
 fs/xfs/xfs_iops.h      |  1 -
 4 files changed, 27 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 291e30cedfd3..bdd552d83107 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -989,8 +989,7 @@ xfs_alloc_file_space(
 	xfs_inode_t		*ip,
 	xfs_off_t		offset,
 	xfs_off_t		len,
-	int			alloc_type,
-	int			attr_flags)
+	int			alloc_type)
 {
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_off_t		count;
@@ -1248,8 +1247,7 @@ STATIC int
 xfs_free_file_space(
 	xfs_inode_t		*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len,
-	int			attr_flags)
+	xfs_off_t		len)
 {
 	int			committed;
 	int			done;
@@ -1267,7 +1265,6 @@ xfs_free_file_space(
 	int			rt;
 	xfs_fileoff_t		startoffset_fsb;
 	xfs_trans_t		*tp;
-	int			need_iolock = 1;
 
 	mp = ip->i_mount;
 
@@ -1284,20 +1281,15 @@ xfs_free_file_space(
 	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
 	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
-	if (attr_flags & XFS_ATTR_NOLOCK)
-		need_iolock = 0;
-	if (need_iolock) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		/* wait for the completion of any pending DIOs */
-		inode_dio_wait(VFS_I(ip));
-	}
+	/* wait for the completion of any pending DIOs */
+	inode_dio_wait(VFS_I(ip));
 
 	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 	ioffset = offset & ~(rounding - 1);
 	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 					      ioffset, -1);
 	if (error)
-		goto out_unlock_iolock;
+		goto out;
 	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
 
 	/*
@@ -1311,7 +1303,7 @@ xfs_free_file_space(
 		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
 					&imap, &nimap, 0);
 		if (error)
-			goto out_unlock_iolock;
+			goto out;
 		ASSERT(nimap == 0 || nimap == 1);
 		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
 			xfs_daddr_t	block;
@@ -1326,7 +1318,7 @@ xfs_free_file_space(
 		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
 					&imap, &nimap, 0);
 		if (error)
-			goto out_unlock_iolock;
+			goto out;
 		ASSERT(nimap == 0 || nimap == 1);
 		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
 			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
@@ -1412,18 +1404,15 @@ xfs_free_file_space(
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
- out_unlock_iolock:
-	if (need_iolock)
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ out:
 	return error;
 
  error0:
 	xfs_bmap_cancel(&free_list);
  error1:
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
-		    XFS_ILOCK_EXCL);
-	return error;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	goto out;
 }
 
 
@@ -1431,8 +1420,7 @@ STATIC int
 xfs_zero_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len,
-	int			attr_flags)
+	xfs_off_t		len)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	uint			granularity;
@@ -1453,9 +1441,6 @@ xfs_zero_file_space(
 	ASSERT(start_boundary >= offset);
 	ASSERT(end_boundary <= offset + len);
 
-	if (!(attr_flags & XFS_ATTR_NOLOCK))
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
 	if (start_boundary < end_boundary - 1) {
 		/* punch out the page cache over the conversion range */
 		truncate_pagecache_range(VFS_I(ip), start_boundary,
@@ -1463,16 +1448,16 @@ xfs_zero_file_space(
 		/* convert the blocks */
 		error = xfs_alloc_file_space(ip, start_boundary,
 					end_boundary - start_boundary - 1,
-					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
-					attr_flags);
+					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
 		if (error)
-			goto out_unlock;
+			goto out;
 
 		/* We've handled the interior of the range, now for the edges */
-		if (start_boundary != offset)
+		if (start_boundary != offset) {
 			error = xfs_iozero(ip, offset, start_boundary - offset);
-		if (error)
-			goto out_unlock;
+			if (error)
+				goto out;
+		}
 
 		if (end_boundary != offset + len)
 			error = xfs_iozero(ip, end_boundary,
@@ -1486,9 +1471,7 @@ xfs_zero_file_space(
 		error = xfs_iozero(ip, offset, len);
 	}
 
-out_unlock:
-	if (!(attr_flags & XFS_ATTR_NOLOCK))
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+out:
 	return error;
 
 }
@@ -1571,8 +1554,7 @@ xfs_change_file_space(
 	setprealloc = clrprealloc = 0;
 	switch (cmd) {
 	case XFS_IOC_ZERO_RANGE:
-		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
-						attr_flags);
+		error = xfs_zero_file_space(ip, startoffset, bf->l_len);
 		if (error)
 			return error;
 		setprealloc = 1;
@@ -1581,7 +1563,7 @@ xfs_change_file_space(
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						XFS_BMAPI_PREALLOC, attr_flags);
+						XFS_BMAPI_PREALLOC);
 		if (error)
 			return error;
 		setprealloc = 1;
@@ -1589,8 +1571,8 @@ xfs_change_file_space(
 
 	case XFS_IOC_UNRESVSP:
 	case XFS_IOC_UNRESVSP64:
-		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
-								attr_flags)))
+		error = xfs_free_file_space(ip, startoffset, bf->l_len);
+		if (error)
 			return error;
 		break;
 
@@ -1608,22 +1590,17 @@ xfs_change_file_space(
 		 * truncate, direct IO) from racing against the transient
 		 * allocated but not written state we can have here.
 		 */
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 		if (startoffset > fsize) {
 			error = xfs_alloc_file_space(ip, fsize,
-					startoffset - fsize, 0,
-					attr_flags | XFS_ATTR_NOLOCK);
-			if (error) {
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+					startoffset - fsize, 0);
+			if (error)
 				break;
-			}
 		}
 
 		iattr.ia_valid = ATTR_SIZE;
 		iattr.ia_size = startoffset;
 
 		error = xfs_setattr_size(ip, &iattr);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
 		if (error)
 			return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c9179820c01b..116300f3b1d4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -816,7 +816,7 @@ xfs_file_fallocate(
 	xfs_flock64_t	bf;
 	xfs_inode_t	*ip = XFS_I(inode);
 	int		cmd = XFS_IOC_RESVSP;
-	int		attr_flags = XFS_ATTR_NOLOCK;
+	int		attr_flags = 0;
 
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5a57e0c80b6b..45287419dc37 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -670,7 +670,9 @@ xfs_ioc_space(
 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 	error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	mnt_drop_write_file(filp);
 	return -error;
 }
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index fe4013af2586..f3738253c69a 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -31,7 +31,6 @@ extern void xfs_setup_inode(struct xfs_inode *);
  * Internal setattr interfaces.
  */
 #define	XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
-#define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
 #define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
 #define XFS_ATTR_SYNC		0x10	/* synchronous operation required */
 
-- 
cgit v1.3


From 83aee9e4c2976143f35b3a42ad1faadf58c53ae7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Oct 2013 00:55:07 -0700
Subject: xfs: simplify the fallocate path

Call xfs_alloc_file_space or xfs_free_file_space directly from
xfs_file_fallocate instead of going through xfs_change_file_space.

This simplified the code by removing the unessecary marshalling of the
arguments into an xfs_flock64_t structure and allows removing checks that
are already done in the VFS code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_bmap_util.c | 39 +++-----------------------
 fs/xfs/xfs_bmap_util.h |  4 +++
 fs/xfs/xfs_file.c      | 76 +++++++++++++++++++++++++++++++-------------------
 3 files changed, 56 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index bdd552d83107..7e1c2ae81c35 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -965,28 +965,9 @@ xfs_free_eofblocks(
 	return error;
 }
 
-/*
- * xfs_alloc_file_space()
- *      This routine allocates disk space for the given file.
- *
- *	If alloc_type == 0, this request is for an ALLOCSP type
- *	request which will change the file size.  In this case, no
- *	DMAPI event will be generated by the call.  A TRUNCATE event
- *	will be generated later by xfs_setattr.
- *
- *	If alloc_type != 0, this request is for a RESVSP type
- *	request, and a DMAPI DM_EVENT_WRITE will be generated if the
- *	lower block boundary byte address is less than the file's
- *	length.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-STATIC int
+int
 xfs_alloc_file_space(
-	xfs_inode_t		*ip,
+	struct xfs_inode	*ip,
 	xfs_off_t		offset,
 	xfs_off_t		len,
 	int			alloc_type)
@@ -1231,21 +1212,9 @@ xfs_zero_remaining_bytes(
 	return error;
 }
 
-/*
- * xfs_free_file_space()
- *      This routine frees disk space for the given file.
- *
- *	This routine is only called by xfs_change_file_space
- *	for an UNRESVSP type call.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-STATIC int
+int
 xfs_free_file_space(
-	xfs_inode_t		*ip,
+	struct xfs_inode	*ip,
 	xfs_off_t		offset,
 	xfs_off_t		len)
 {
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 061260946f7a..77cf5001719b 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -96,6 +96,10 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_change_file_space(struct xfs_inode *ip, int cmd,
 			      xfs_flock64_t *bf, xfs_off_t offset,
 			      int attr_flags);
+int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
+			     xfs_off_t len, int alloc_type);
+int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
+			    xfs_off_t len);
 
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 116300f3b1d4..c8a75a75e25a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -805,44 +805,64 @@ out:
 
 STATIC long
 xfs_file_fallocate(
-	struct file	*file,
-	int		mode,
-	loff_t		offset,
-	loff_t		len)
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len)
 {
-	struct inode	*inode = file_inode(file);
-	long		error;
-	loff_t		new_size = 0;
-	xfs_flock64_t	bf;
-	xfs_inode_t	*ip = XFS_I(inode);
-	int		cmd = XFS_IOC_RESVSP;
-	int		attr_flags = 0;
+	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_trans	*tp;
+	long			error;
+	loff_t			new_size = 0;
 
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
-	bf.l_whence = 0;
-	bf.l_start = offset;
-	bf.l_len = len;
-
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		error = xfs_free_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
+	} else {
+		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		    offset + len > i_size_read(inode)) {
+			new_size = offset + len;
+			error = -inode_newsize_ok(inode, new_size);
+			if (error)
+				goto out_unlock;
+		}
 
-	if (mode & FALLOC_FL_PUNCH_HOLE)
-		cmd = XFS_IOC_UNRESVSP;
-
-	/* check the new inode size is valid before allocating */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-	    offset + len > i_size_read(inode)) {
-		new_size = offset + len;
-		error = inode_newsize_ok(inode, new_size);
+		error = xfs_alloc_file_space(ip, offset, len,
+					     XFS_BMAPI_PREALLOC);
 		if (error)
 			goto out_unlock;
 	}
 
-	if (file->f_flags & O_DSYNC)
-		attr_flags |= XFS_ATTR_SYNC;
+	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_unlock;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	ip->i_d.di_mode &= ~S_ISUID;
+	if (ip->i_d.di_mode & S_IXGRP)
+		ip->i_d.di_mode &= ~S_ISGID;
+
+	if (!(mode & FALLOC_FL_PUNCH_HOLE))
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
+	if (file->f_flags & O_DSYNC)
+		xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
 	if (error)
 		goto out_unlock;
 
@@ -852,12 +872,12 @@ xfs_file_fallocate(
 
 		iattr.ia_valid = ATTR_SIZE;
 		iattr.ia_size = new_size;
-		error = -xfs_setattr_size(ip, &iattr);
+		error = xfs_setattr_size(ip, &iattr);
 	}
 
 out_unlock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	return error;
+	return -error;
 }
 
 
-- 
cgit v1.3


From 865e9446b4c17f78be0b1387426394b6bfc278f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Oct 2013 00:55:08 -0700
Subject: xfs: fold xfs_change_file_space into xfs_ioc_space

Now that only one caller of xfs_change_file_space is left it can be merged
into said caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_bmap_util.c | 178 +------------------------------------------------
 fs/xfs/xfs_bmap_util.h |   5 +-
 fs/xfs/xfs_ioctl.c     | 130 +++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_iops.h      |   4 +-
 4 files changed, 126 insertions(+), 191 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 7e1c2ae81c35..45cf5a2fd1bf 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1385,7 +1385,7 @@ xfs_free_file_space(
 }
 
 
-STATIC int
+int
 xfs_zero_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
@@ -1445,182 +1445,6 @@ out:
 
 }
 
-/*
- * xfs_change_file_space()
- *      This routine allocates or frees disk space for the given file.
- *      The user specified parameters are checked for alignment and size
- *      limitations.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-int
-xfs_change_file_space(
-	xfs_inode_t	*ip,
-	int		cmd,
-	xfs_flock64_t	*bf,
-	xfs_off_t	offset,
-	int		attr_flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		clrprealloc;
-	int		error;
-	xfs_fsize_t	fsize;
-	int		setprealloc;
-	xfs_off_t	startoffset;
-	xfs_trans_t	*tp;
-	struct iattr	iattr;
-
-	if (!S_ISREG(ip->i_d.di_mode))
-		return XFS_ERROR(EINVAL);
-
-	switch (bf->l_whence) {
-	case 0: /*SEEK_SET*/
-		break;
-	case 1: /*SEEK_CUR*/
-		bf->l_start += offset;
-		break;
-	case 2: /*SEEK_END*/
-		bf->l_start += XFS_ISIZE(ip);
-		break;
-	default:
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * length of <= 0 for resv/unresv/zero is invalid.  length for
-	 * alloc/free is ignored completely and we have no idea what userspace
-	 * might have set it to, so set it to zero to allow range
-	 * checks to pass.
-	 */
-	switch (cmd) {
-	case XFS_IOC_ZERO_RANGE:
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_UNRESVSP64:
-		if (bf->l_len <= 0)
-			return XFS_ERROR(EINVAL);
-		break;
-	default:
-		bf->l_len = 0;
-		break;
-	}
-
-	if (bf->l_start < 0 ||
-	    bf->l_start > mp->m_super->s_maxbytes ||
-	    bf->l_start + bf->l_len < 0 ||
-	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
-		return XFS_ERROR(EINVAL);
-
-	bf->l_whence = 0;
-
-	startoffset = bf->l_start;
-	fsize = XFS_ISIZE(ip);
-
-	setprealloc = clrprealloc = 0;
-	switch (cmd) {
-	case XFS_IOC_ZERO_RANGE:
-		error = xfs_zero_file_space(ip, startoffset, bf->l_len);
-		if (error)
-			return error;
-		setprealloc = 1;
-		break;
-
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_RESVSP64:
-		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						XFS_BMAPI_PREALLOC);
-		if (error)
-			return error;
-		setprealloc = 1;
-		break;
-
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_UNRESVSP64:
-		error = xfs_free_file_space(ip, startoffset, bf->l_len);
-		if (error)
-			return error;
-		break;
-
-	case XFS_IOC_ALLOCSP:
-	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP:
-	case XFS_IOC_FREESP64:
-		/*
-		 * These operations actually do IO when extending the file, but
-		 * the allocation is done seperately to the zeroing that is
-		 * done. This set of operations need to be serialised against
-		 * other IO operations, such as truncate and buffered IO. We
-		 * need to take the IOLOCK here to serialise the allocation and
-		 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
-		 * truncate, direct IO) from racing against the transient
-		 * allocated but not written state we can have here.
-		 */
-		if (startoffset > fsize) {
-			error = xfs_alloc_file_space(ip, fsize,
-					startoffset - fsize, 0);
-			if (error)
-				break;
-		}
-
-		iattr.ia_valid = ATTR_SIZE;
-		iattr.ia_size = startoffset;
-
-		error = xfs_setattr_size(ip, &iattr);
-
-		if (error)
-			return error;
-
-		clrprealloc = 1;
-		break;
-
-	default:
-		ASSERT(0);
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * update the inode timestamp, mode, and prealloc flag bits
-	 */
-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-	if ((attr_flags & XFS_ATTR_DMI) == 0) {
-		ip->i_d.di_mode &= ~S_ISUID;
-
-		/*
-		 * Note that we don't have to worry about mandatory
-		 * file locking being disabled here because we only
-		 * clear the S_ISGID bit if the Group execute bit is
-		 * on, but if it was on then mandatory locking wouldn't
-		 * have been enabled.
-		 */
-		if (ip->i_d.di_mode & S_IXGRP)
-			ip->i_d.di_mode &= ~S_ISGID;
-
-		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	}
-	if (setprealloc)
-		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-	else if (clrprealloc)
-		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	if (attr_flags & XFS_ATTR_SYNC)
-		xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp, 0);
-}
-
 /*
  * We need to check that the format of the data fork in the temporary inode is
  * valid for the target inode before doing the swap. This is not a problem with
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 77cf5001719b..900747b25772 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -93,13 +93,12 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 			     int *is_empty);
 
 /* preallocation and hole punch interface */
-int	xfs_change_file_space(struct xfs_inode *ip, int cmd,
-			      xfs_flock64_t *bf, xfs_off_t offset,
-			      int attr_flags);
 int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			     xfs_off_t len, int alloc_type);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			    xfs_off_t len);
+int	xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
+			    xfs_off_t len);
 
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 45287419dc37..e448d735346b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -641,7 +641,11 @@ xfs_ioc_space(
 	unsigned int		cmd,
 	xfs_flock64_t		*bf)
 {
-	int			attr_flags = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct iattr		iattr;
+	bool			setprealloc = false;
+	bool			clrprealloc = false;
 	int			error;
 
 	/*
@@ -661,17 +665,127 @@ xfs_ioc_space(
 	if (!S_ISREG(inode->i_mode))
 		return -XFS_ERROR(EINVAL);
 
-	if (filp->f_flags & O_DSYNC)
-		attr_flags |= XFS_ATTR_SYNC;
-
-	if (ioflags & IO_INVIS)
-		attr_flags |= XFS_ATTR_DMI;
-
 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;
+
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-	error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+
+	switch (bf->l_whence) {
+	case 0: /*SEEK_SET*/
+		break;
+	case 1: /*SEEK_CUR*/
+		bf->l_start += filp->f_pos;
+		break;
+	case 2: /*SEEK_END*/
+		bf->l_start += XFS_ISIZE(ip);
+		break;
+	default:
+		error = XFS_ERROR(EINVAL);
+		goto out_unlock;
+	}
+
+	/*
+	 * length of <= 0 for resv/unresv/zero is invalid.  length for
+	 * alloc/free is ignored completely and we have no idea what userspace
+	 * might have set it to, so set it to zero to allow range
+	 * checks to pass.
+	 */
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if (bf->l_len <= 0) {
+			error = XFS_ERROR(EINVAL);
+			goto out_unlock;
+		}
+		break;
+	default:
+		bf->l_len = 0;
+		break;
+	}
+
+	if (bf->l_start < 0 ||
+	    bf->l_start > mp->m_super->s_maxbytes ||
+	    bf->l_start + bf->l_len < 0 ||
+	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
+		error = XFS_ERROR(EINVAL);
+		goto out_unlock;
+	}
+
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
+		if (!error)
+			setprealloc = true;
+		break;
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+		error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
+						XFS_BMAPI_PREALLOC);
+		if (!error)
+			setprealloc = true;
+		break;
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		error = xfs_free_file_space(ip, bf->l_start, bf->l_len);
+		break;
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_FREESP64:
+		if (bf->l_start > XFS_ISIZE(ip)) {
+			error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
+					bf->l_start - XFS_ISIZE(ip), 0);
+			if (error)
+				goto out_unlock;
+		}
+
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = bf->l_start;
+
+		error = xfs_setattr_size(ip, &iattr);
+		if (!error)
+			clrprealloc = true;
+		break;
+	default:
+		ASSERT(0);
+		error = XFS_ERROR(EINVAL);
+	}
+
+	if (error)
+		goto out_unlock;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_unlock;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	if (!(ioflags & IO_INVIS)) {
+		ip->i_d.di_mode &= ~S_ISUID;
+		if (ip->i_d.di_mode & S_IXGRP)
+			ip->i_d.di_mode &= ~S_ISGID;
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+
+	if (setprealloc)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	else if (clrprealloc)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	if (filp->f_flags & O_DSYNC)
+		xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+
+out_unlock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	mnt_drop_write_file(filp);
 	return -error;
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index f3738253c69a..d2c5057b5cc4 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -30,9 +30,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
 /*
  * Internal setattr interfaces.
  */
-#define	XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
-#define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
-#define XFS_ATTR_SYNC		0x10	/* synchronous operation required */
+#define XFS_ATTR_NOACL		0x01	/* Don't call xfs_acl_chmod */
 
 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
 			       int flags);
-- 
cgit v1.3


From cffbfa66485e4940091d7e64024e802314d24c09 Mon Sep 17 00:00:00 2001
From: Haicheng Li <haicheng.li@linux.intel.com>
Date: Fri, 18 Oct 2013 17:24:07 +0800
Subject: f2fs: use true and false for boolean value

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 3b203597c744..862fef3ae253 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1277,9 +1277,9 @@ static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
 			__mark_sit_entry_dirty(sbi, segno);
 		}
 		update_sits_in_cursum(sum, -sits_in_cursum(sum));
-		return 1;
+		return true;
 	}
-	return 0;
+	return false;
 }
 
 /*
-- 
cgit v1.3


From 435f2a1b58ac8f50894f23549c97791085f7cba2 Mon Sep 17 00:00:00 2001
From: Haicheng Li <haicheng.li@linux.intel.com>
Date: Fri, 18 Oct 2013 17:24:08 +0800
Subject: f2fs: no need to check other dirty_segmap when the seg has been found

Because one dirty seg can only be mapped to one dirty_type. Otherwise, it's a bug.

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
[Jaegeuk Kim: modify a comment related to this patch]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 862fef3ae253..8ac161965242 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -78,10 +78,14 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 	if (dirty_type == DIRTY) {
 		enum dirty_type t = DIRTY_HOT_DATA;
 
-		/* clear all the bitmaps */
-		for (; t <= DIRTY_COLD_NODE; t++)
-			if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+		/* clear its dirty bitmap */
+		for (; t <= DIRTY_COLD_NODE; t++) {
+			if (test_and_clear_bit(segno,
+						dirty_i->dirty_segmap[t])) {
 				dirty_i->nr_dirty[t]--;
+				break;
+			}
+		}
 
 		if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
 			clear_bit(GET_SECNO(sbi, segno),
-- 
cgit v1.3


From 7bd59381c82defe19875284c48b1ac9dacd16e8f Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Tue, 22 Oct 2013 14:52:26 +0800
Subject: f2fs: introduce f2fs_kmem_cache_alloc to hide the unfailed, kmem
 cache allocation

Introduce the unfailed version of kmem_cache_alloc named f2fs_kmem_cache_alloc
to hide the retry routine and make the code a bit cleaner.

v2:
   Fix the wrong use of 'retry' tag pointed out by Gao feng.
   Use more neat code to remove redundant tag suggested by Haicheng Li.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c | 26 +++++++-------------------
 fs/f2fs/f2fs.h       | 15 +++++++++++++++
 fs/f2fs/gc.c         |  8 ++------
 fs/f2fs/node.c       | 27 +++++++++++----------------
 4 files changed, 35 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 8d16071a3b23..6fb484c84ee7 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -226,12 +226,8 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 			break;
 		orphan = NULL;
 	}
-retry:
-	new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
-	if (!new) {
-		cond_resched();
-		goto retry;
-	}
+
+	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
 	new->ino = ino;
 
 	/* add new_oentry into list which is sorted by inode number */
@@ -484,12 +480,8 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
-retry:
-	new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
-	if (!new) {
-		cond_resched();
-		goto retry;
-	}
+
+	new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	new->inode = inode;
 	INIT_LIST_HEAD(&new->list);
 
@@ -506,13 +498,9 @@ retry:
 void add_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	struct dir_inode_entry *new;
-retry:
-	new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
-	if (!new) {
-		cond_resched();
-		goto retry;
-	}
+	struct dir_inode_entry *new =
+			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+
 	new->inode = inode;
 	INIT_LIST_HEAD(&new->list);
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 171c52fc95bb..29492754bd7d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -18,6 +18,7 @@
 #include <linux/crc32.h>
 #include <linux/magic.h>
 #include <linux/kobject.h>
+#include <linux/sched.h>
 
 /*
  * For mount options
@@ -787,6 +788,20 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
 	return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
 }
 
+static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
+						gfp_t flags)
+{
+	void *entry;
+retry:
+	entry = kmem_cache_alloc(cachep, flags);
+	if (!entry) {
+		cond_resched();
+		goto retry;
+	}
+
+	return entry;
+}
+
 #define RAW_IS_INODE(p)	((p)->footer.nid == (p)->footer.ino)
 
 static inline bool IS_INODE(struct page *page)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index fbad96846c7c..7914b92a6967 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -361,12 +361,8 @@ static void add_gc_inode(struct inode *inode, struct list_head *ilist)
 		iput(inode);
 		return;
 	}
-repeat:
-	new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
-	if (!new_ie) {
-		cond_resched();
-		goto repeat;
-	}
+
+	new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
 	new_ie->inode = inode;
 	list_add_tail(&new_ie->list, ilist);
 }
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ef80f791b02b..4fa3fd5def34 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1296,23 +1296,18 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
 	if (nid == 0)
 		return 0;
 
-	if (!build)
-		goto retry;
-
-	/* do not add allocated nids */
-	read_lock(&nm_i->nat_tree_lock);
-	ne = __lookup_nat_cache(nm_i, nid);
-	if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
-		allocated = true;
-	read_unlock(&nm_i->nat_tree_lock);
-	if (allocated)
-		return 0;
-retry:
-	i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
-	if (!i) {
-		cond_resched();
-		goto retry;
+	if (build) {
+		/* do not add allocated nids */
+		read_lock(&nm_i->nat_tree_lock);
+		ne = __lookup_nat_cache(nm_i, nid);
+		if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+			allocated = true;
+		read_unlock(&nm_i->nat_tree_lock);
+		if (allocated)
+			return 0;
 	}
+
+	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
 	i->nid = nid;
 	i->state = NID_NEW;
 
-- 
cgit v1.3


From e71d1a59e757201454fd3cf66ddaceaf961bfb41 Mon Sep 17 00:00:00 2001
From: "wang.bo116@zte.com.cn" <wang.bo116@zte.com.cn>
Date: Fri, 16 Aug 2013 15:43:36 +0800
Subject: UBIFS: remove unnecessary code in ubifs_garbage_collect

In ubifs_garbage_collect,local variable "space_before" calculate twice. In
fact, at the beginning of the loop, there is no need to calculate this
variable. Calculate it before call "ubifs_garbage_collect_leb" is enough. This
patch just remove the unnecessary calculate code.

Signed-off-by: wang bo <wang.bo116@zte.com.cn>
Acked-by: Brian Norris <computersforpeace@gmail.com>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
---
 fs/ubifs/gc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 76ca53cd3eee..9718da86ad01 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -668,8 +668,7 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
 	ubifs_assert(!wbuf->used);
 
 	for (i = 0; ; i++) {
-		int space_before = c->leb_size - wbuf->offs - wbuf->used;
-		int space_after;
+		int space_before, space_after;
 
 		cond_resched();
 
-- 
cgit v1.3


From 70a9883c5f34b215b8a77665cefd0398edc5a9ef Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 23 Oct 2013 10:36:05 +1100
Subject: xfs: create a shared header file for format-related information

All of the buffer operations structures are needed to be exported
for xfs_db, so move them all to a common location rather than
spreading them all over the place. They are verifying the on-disk
format, so while xfs_format.h might be a good place, it is not part
of the on disk format.

Hence we need to create a new header file that we centralise these
related definitions. Start by moving the bffer operations
structures, and then also move all the other definitions that have
crept into xfs_log_format.h and xfs_format.h as there was no other
shared header file to put them in.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ag.h             |   4 -
 fs/xfs/xfs_alloc.c          |   3 +-
 fs/xfs/xfs_alloc.h          |   3 -
 fs/xfs/xfs_alloc_btree.h    |   2 -
 fs/xfs/xfs_aops.c           |   4 +-
 fs/xfs/xfs_attr.c           |   1 +
 fs/xfs/xfs_attr_inactive.c  |   1 +
 fs/xfs/xfs_attr_leaf.h      |   2 -
 fs/xfs/xfs_attr_remote.h    |   2 -
 fs/xfs/xfs_bmap.c           |   1 +
 fs/xfs/xfs_bmap_btree.h     |   2 -
 fs/xfs/xfs_bmap_util.c      |   1 +
 fs/xfs/xfs_btree.c          |   2 +-
 fs/xfs/xfs_da_btree.c       |   3 +-
 fs/xfs/xfs_da_btree.h       |   2 -
 fs/xfs/xfs_dquot.c          |   1 +
 fs/xfs/xfs_dquot.h          |   2 -
 fs/xfs/xfs_extent_busy.c    |   3 +-
 fs/xfs/xfs_file.c           |   1 +
 fs/xfs/xfs_format.h         |  10 --
 fs/xfs/xfs_fsops.c          |   2 +
 fs/xfs/xfs_ialloc.c         |   3 +-
 fs/xfs/xfs_ialloc.h         |   2 -
 fs/xfs/xfs_ialloc_btree.h   |   2 -
 fs/xfs/xfs_icreate_item.c   |   2 +-
 fs/xfs/xfs_inode.c          |   1 +
 fs/xfs/xfs_inode_buf.c      |   1 +
 fs/xfs/xfs_inode_buf.h      |   3 -
 fs/xfs/xfs_ioctl.c          |   1 +
 fs/xfs/xfs_iomap.c          |   1 +
 fs/xfs/xfs_iops.c           |   1 +
 fs/xfs/xfs_itable.c         |   2 +-
 fs/xfs/xfs_log.c            |   2 +-
 fs/xfs/xfs_log_cil.c        |   2 +-
 fs/xfs/xfs_log_format.h     | 177 --------------------------------
 fs/xfs/xfs_log_recover.c    |   1 +
 fs/xfs/xfs_log_rlimit.c     |   1 +
 fs/xfs/xfs_mount.c          |   1 +
 fs/xfs/xfs_qm.c             |   1 +
 fs/xfs/xfs_qm_syscalls.c    |   1 +
 fs/xfs/xfs_quota.h          |   2 -
 fs/xfs/xfs_rtalloc.c        |   1 +
 fs/xfs/xfs_sb.h             |   3 -
 fs/xfs/xfs_shared.h         | 244 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_super.c          |   1 +
 fs/xfs/xfs_symlink.c        |   1 +
 fs/xfs/xfs_symlink_remote.c |   1 +
 fs/xfs/xfs_trace.c          |   2 +-
 fs/xfs/xfs_trans.c          |   1 +
 fs/xfs/xfs_trans_buf.c      |   2 +-
 fs/xfs/xfs_trans_dquot.c    |   1 +
 fs/xfs/xfs_trans_extfree.c  |   2 +-
 fs/xfs/xfs_trans_inode.c    |   2 +-
 fs/xfs/xfs_trans_resv.c     |   1 +
 54 files changed, 289 insertions(+), 232 deletions(-)
 create mode 100644 fs/xfs/xfs_shared.h

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 1cb740afd674..3fc109819c34 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -128,8 +128,6 @@ typedef struct xfs_agf {
 extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
-
 /*
  * Size of the unlinked inode hash table in the agi.
  */
@@ -191,8 +189,6 @@ typedef struct xfs_agi {
 extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
 				xfs_agnumber_t agno, struct xfs_buf **bpp);
 
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-
 /*
  * The third a.g. block contains the a.g. freelist, an array
  * of block pointers to blocks owned by the allocation btree code.
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5a1393f5e020..2bfca06e0ee7 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -17,7 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 99d0a6101558..feacb061bab7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,7 +231,4 @@ xfs_alloc_get_rec(
 	xfs_extlen_t		*len,	/* output: length of extent */
 	int			*stat);	/* output: success/failure */
 
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
-extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index e3a3f7424192..72676c36b1bb 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -95,6 +95,4 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
 		xfs_agnumber_t, xfs_btnum_t);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
-
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f3f95f094af8..2c61e9622f57 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -16,9 +16,11 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_log.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
+#include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index ddcf2267ffa6..21af043bbee8 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index bb24b07cbedb..e90a8760bc3d 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -19,6 +19,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index c1022138c7e6..da34e64b5323 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -336,6 +336,4 @@ void	xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
 void	xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
 				   struct xfs_attr3_icleaf_hdr *from);
 
-extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
-
 #endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
index 92a8fd7977cc..b3e1e1bdfb3b 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -45,8 +45,6 @@ struct xfs_attr3_rmt_hdr {
 	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
 			sizeof(struct xfs_attr3_rmt_hdr) : 0))
 
-extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
-
 int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
 
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index e6b42026f9d4..6870aad5d815 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index e367461a638e..e2b0768a89ba 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -243,6 +243,4 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
 
-extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
-
 #endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 45cf5a2fd1bf..4f246628a900 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -19,6 +19,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 5690e102243d..1a297e26e26c 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 20bf8e8002d6..ac51abeb4782 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -18,7 +18,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index b1f267995dea..a762cfa4097d 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -309,8 +309,6 @@ int	xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
 			 xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			 struct xfs_buf **bpp, int which_fork);
 
-extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
-
 /*
  * Utility routines.
  */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1ee776d477c3..8b1d2c0a9054 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 55abbca2883d..d22ed0053c32 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -172,6 +172,4 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
 	return dqp;
 }
 
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-
 #endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index e43708e2f080..74828a7895eb 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -19,7 +19,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c8a75a75e25a..57d95b2f8692 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index 35c08ff54ca0..a790428f3bf4 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -156,14 +156,4 @@ struct xfs_dsymlink_hdr {
 	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
 			sizeof(struct xfs_dsymlink_hdr) : 0))
 
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
-			uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
-			uint32_t size, struct xfs_buf *bp);
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
-				 struct xfs_inode *ip, struct xfs_ifork *ifp);
-
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-
 #endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 76c7b2b4fa8d..e5d2c43c65c4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -18,6 +18,8 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ccf2fb143962..8530cabf128d 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -17,7 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 68c07320f096..155779803d67 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -158,6 +158,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
 			  xfs_agnumber_t agno, xfs_agblock_t agbno,
 			  xfs_agblock_t length, unsigned int gen);
 
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 3ac36b7642e9..cfbfe461abbc 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -110,6 +110,4 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
-extern const struct xfs_buf_ops xfs_inobt_buf_ops;
-
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 5a5a593994d4..df8baf38eb85 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5fce7261a64b..1aaef689794d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,6 +20,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 63382d37f565..65fa56edecf3 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
index abba0ae8cf2d..9308c47f2a52 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/xfs_inode_buf.h
@@ -47,7 +47,4 @@ void	xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 #define	xfs_inobp_check(mp, bp)
 #endif /* DEBUG */
 
-extern const struct xfs_buf_ops xfs_inode_buf_ops;
-extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
-
 #endif	/* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index e448d735346b..c6c62d12a934 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 338b33263c56..d8fb6dfe75ed 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 52ecd1ccec9f..24fe333b8c41 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_acl.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 084b3e1741fd..4f50e637f33b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0461f0872244..fbaf70af3d61 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index da8524e779b6..a844f77d2cc3 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -17,7 +17,7 @@
 
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
index ca7e28a8ed31..f0969c77bdbe 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/xfs_log_format.h
@@ -233,178 +233,6 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }, \
 	{ XFS_LI_ICREATE,	"XFS_LI_ICREATE" }
 
-/*
- * Transaction types.  Used to distinguish types of buffers.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE	1
-#define XFS_TRANS_SETATTR_SIZE		2
-#define XFS_TRANS_INACTIVE		3
-#define XFS_TRANS_CREATE		4
-#define XFS_TRANS_CREATE_TRUNC		5
-#define XFS_TRANS_TRUNCATE_FILE		6
-#define XFS_TRANS_REMOVE		7
-#define XFS_TRANS_LINK			8
-#define XFS_TRANS_RENAME		9
-#define XFS_TRANS_MKDIR			10
-#define XFS_TRANS_RMDIR			11
-#define XFS_TRANS_SYMLINK		12
-#define XFS_TRANS_SET_DMATTRS		13
-#define XFS_TRANS_GROWFS		14
-#define XFS_TRANS_STRAT_WRITE		15
-#define XFS_TRANS_DIOSTRAT		16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define	XFS_TRANS_WRITEID		18
-#define	XFS_TRANS_ADDAFORK		19
-#define	XFS_TRANS_ATTRINVAL		20
-#define	XFS_TRANS_ATRUNCATE		21
-#define	XFS_TRANS_ATTR_SET		22
-#define	XFS_TRANS_ATTR_RM		23
-#define	XFS_TRANS_ATTR_FLAG		24
-#define	XFS_TRANS_CLEAR_AGI_BUCKET	25
-#define XFS_TRANS_QM_SBCHANGE		26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1		27
-#define XFS_TRANS_DUMMY2		28
-#define XFS_TRANS_QM_QUOTAOFF		29
-#define XFS_TRANS_QM_DQALLOC		30
-#define XFS_TRANS_QM_SETQLIM		31
-#define XFS_TRANS_QM_DQCLUSTER		32
-#define XFS_TRANS_QM_QINOCREATE		33
-#define XFS_TRANS_QM_QUOTAOFF_END	34
-#define XFS_TRANS_SB_UNIT		35
-#define XFS_TRANS_FSYNC_TS		36
-#define	XFS_TRANS_GROWFSRT_ALLOC	37
-#define	XFS_TRANS_GROWFSRT_ZERO		38
-#define	XFS_TRANS_GROWFSRT_FREE		39
-#define	XFS_TRANS_SWAPEXT		40
-#define	XFS_TRANS_SB_COUNT		41
-#define	XFS_TRANS_CHECKPOINT		42
-#define	XFS_TRANS_ICREATE		43
-#define	XFS_TRANS_TYPE_MAX		43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
-	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
-	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
-	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
-	{ XFS_TRANS_CREATE,		"CREATE" }, \
-	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
-	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
-	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
-	{ XFS_TRANS_LINK,		"LINK" }, \
-	{ XFS_TRANS_RENAME,		"RENAME" }, \
-	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
-	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
-	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
-	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
-	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
-	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
-	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
-	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
-	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
-	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
-	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
-	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
-	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
-	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
-	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
-	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
-	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
-	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
-	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
-	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
-	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
-	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
-	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
-	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
-	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
-	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
-	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
-	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
-	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
-	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
-	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
-	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
-	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
-
-/*
- * This structure is used to track log items associated with
- * a transaction.  It points to the log item and keeps some
- * flags to track the state of the log item.  It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
-	struct xfs_log_item	*lid_item;
-	struct list_head	lid_trans;
-	unsigned char		lid_flags;
-};
-
-#define XFS_LID_DIRTY		0x1
-
-/*
- * Values for t_flags.
- */
-#define	XFS_TRANS_DIRTY		0x01	/* something needs to be logged */
-#define	XFS_TRANS_SB_DIRTY	0x02	/* superblock is modified */
-#define	XFS_TRANS_PERM_LOG_RES	0x04	/* xact took a permanent log res */
-#define	XFS_TRANS_SYNC		0x08	/* make commit synchronous */
-#define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
-#define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT	0x40	/* Transaction has elevated writer
-					   count in superblock */
-
-/*
- * Values for call flags parameter.
- */
-#define	XFS_TRANS_RELEASE_LOG_RES	0x4
-#define	XFS_TRANS_ABORT			0x8
-
-/*
- * Field values for xfs_trans_mod_sb.
- */
-#define	XFS_TRANS_SB_ICOUNT		0x00000001
-#define	XFS_TRANS_SB_IFREE		0x00000002
-#define	XFS_TRANS_SB_FDBLOCKS		0x00000004
-#define	XFS_TRANS_SB_RES_FDBLOCKS	0x00000008
-#define	XFS_TRANS_SB_FREXTENTS		0x00000010
-#define	XFS_TRANS_SB_RES_FREXTENTS	0x00000020
-#define	XFS_TRANS_SB_DBLOCKS		0x00000040
-#define	XFS_TRANS_SB_AGCOUNT		0x00000080
-#define	XFS_TRANS_SB_IMAXPCT		0x00000100
-#define	XFS_TRANS_SB_REXTSIZE		0x00000200
-#define	XFS_TRANS_SB_RBMBLOCKS		0x00000400
-#define	XFS_TRANS_SB_RBLOCKS		0x00000800
-#define	XFS_TRANS_SB_REXTENTS		0x00001000
-#define	XFS_TRANS_SB_REXTSLOG		0x00002000
-
-/*
- * Here we centralize the specification of XFS meta-data buffer
- * reference count values.  This determine how hard the buffer
- * cache tries to hold onto the buffer.
- */
-#define	XFS_AGF_REF		4
-#define	XFS_AGI_REF		4
-#define	XFS_AGFL_REF		3
-#define	XFS_INO_BTREE_REF	3
-#define	XFS_ALLOC_BTREE_REF	2
-#define	XFS_BMAP_BTREE_REF	2
-#define	XFS_DIR_BTREE_REF	2
-#define	XFS_INO_REF		2
-#define	XFS_ATTR_BTREE_REF	1
-#define	XFS_DQUOT_REF		1
-
-/*
- * Flags for xfs_trans_ichgtime().
- */
-#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
-#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
-#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
-
-
 /*
  * Inode Log Item Format definitions.
  *
@@ -797,7 +625,6 @@ typedef struct xfs_qoff_logformat {
 	char			qf_pad[12];	/* padding for future */
 } xfs_qoff_logformat_t;
 
-
 /*
  * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
  */
@@ -849,8 +676,4 @@ struct xfs_icreate_log {
 	__be32		icl_gen;	/* inode generation number to use */
 };
 
-int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
-int	xfs_log_calc_minimum_size(struct xfs_mount *);
-
-
 #endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e7ca48e8f515..cbefc1235083 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
index bbcec0bbc12d..0961310abe46 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_ag.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5dcc68019d1b..9515f96b4ab7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3e6c2e6c9cd2..dc2d4f7ad7c9 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index fb6709191811..174390703eb6 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -21,6 +21,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e7d84d2d8683..84661d4431fa 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -154,6 +154,4 @@ extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
 				xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-
 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6f9e63c9fc26..6f63de303e8e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 6835b44f850e..35061d4b614c 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -699,7 +699,4 @@ extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 extern void	xfs_sb_quota_from_disk(struct xfs_sb *sbp);
 
-extern const struct xfs_buf_ops xfs_sb_buf_ops;
-extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
-
 #endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
new file mode 100644
index 000000000000..8c5035a13df1
--- /dev/null
+++ b/fs/xfs/xfs_shared.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+
+/*
+ * Buffer verifier operations are widely used, including userspace tools
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+/*
+ * Transaction types.  Used to distinguish types of buffers. These never reach
+ * the log.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE	1
+#define XFS_TRANS_SETATTR_SIZE		2
+#define XFS_TRANS_INACTIVE		3
+#define XFS_TRANS_CREATE		4
+#define XFS_TRANS_CREATE_TRUNC		5
+#define XFS_TRANS_TRUNCATE_FILE		6
+#define XFS_TRANS_REMOVE		7
+#define XFS_TRANS_LINK			8
+#define XFS_TRANS_RENAME		9
+#define XFS_TRANS_MKDIR			10
+#define XFS_TRANS_RMDIR			11
+#define XFS_TRANS_SYMLINK		12
+#define XFS_TRANS_SET_DMATTRS		13
+#define XFS_TRANS_GROWFS		14
+#define XFS_TRANS_STRAT_WRITE		15
+#define XFS_TRANS_DIOSTRAT		16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define	XFS_TRANS_WRITEID		18
+#define	XFS_TRANS_ADDAFORK		19
+#define	XFS_TRANS_ATTRINVAL		20
+#define	XFS_TRANS_ATRUNCATE		21
+#define	XFS_TRANS_ATTR_SET		22
+#define	XFS_TRANS_ATTR_RM		23
+#define	XFS_TRANS_ATTR_FLAG		24
+#define	XFS_TRANS_CLEAR_AGI_BUCKET	25
+#define XFS_TRANS_QM_SBCHANGE		26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1		27
+#define XFS_TRANS_DUMMY2		28
+#define XFS_TRANS_QM_QUOTAOFF		29
+#define XFS_TRANS_QM_DQALLOC		30
+#define XFS_TRANS_QM_SETQLIM		31
+#define XFS_TRANS_QM_DQCLUSTER		32
+#define XFS_TRANS_QM_QINOCREATE		33
+#define XFS_TRANS_QM_QUOTAOFF_END	34
+#define XFS_TRANS_SB_UNIT		35
+#define XFS_TRANS_FSYNC_TS		36
+#define	XFS_TRANS_GROWFSRT_ALLOC	37
+#define	XFS_TRANS_GROWFSRT_ZERO		38
+#define	XFS_TRANS_GROWFSRT_FREE		39
+#define	XFS_TRANS_SWAPEXT		40
+#define	XFS_TRANS_SB_COUNT		41
+#define	XFS_TRANS_CHECKPOINT		42
+#define	XFS_TRANS_ICREATE		43
+#define	XFS_TRANS_TYPE_MAX		43
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
+	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
+	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
+	{ XFS_TRANS_CREATE,		"CREATE" }, \
+	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
+	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
+	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
+	{ XFS_TRANS_LINK,		"LINK" }, \
+	{ XFS_TRANS_RENAME,		"RENAME" }, \
+	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
+	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
+	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
+	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
+	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
+	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
+	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
+	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
+	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
+	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
+	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
+	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
+	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
+	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
+	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
+	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
+	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
+	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
+	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
+	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
+	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
+	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
+	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
+	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
+	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
+	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
+	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
+	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
+	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
+	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
+	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.  It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+	struct xfs_log_item	*lid_item;
+	struct list_head	lid_trans;
+	unsigned char		lid_flags;
+};
+
+#define XFS_LID_DIRTY		0x1
+
+/* log size calculation functions */
+int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int	xfs_log_calc_minimum_size(struct xfs_mount *);
+
+
+/*
+ * Values for t_flags.
+ */
+#define	XFS_TRANS_DIRTY		0x01	/* something needs to be logged */
+#define	XFS_TRANS_SB_DIRTY	0x02	/* superblock is modified */
+#define	XFS_TRANS_PERM_LOG_RES	0x04	/* xact took a permanent log res */
+#define	XFS_TRANS_SYNC		0x08	/* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE	0x20    /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT	0x40	/* Transaction has elevated writer
+					   count in superblock */
+/*
+ * Values for call flags parameter.
+ */
+#define	XFS_TRANS_RELEASE_LOG_RES	0x4
+#define	XFS_TRANS_ABORT			0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define	XFS_TRANS_SB_ICOUNT		0x00000001
+#define	XFS_TRANS_SB_IFREE		0x00000002
+#define	XFS_TRANS_SB_FDBLOCKS		0x00000004
+#define	XFS_TRANS_SB_RES_FDBLOCKS	0x00000008
+#define	XFS_TRANS_SB_FREXTENTS		0x00000010
+#define	XFS_TRANS_SB_RES_FREXTENTS	0x00000020
+#define	XFS_TRANS_SB_DBLOCKS		0x00000040
+#define	XFS_TRANS_SB_AGCOUNT		0x00000080
+#define	XFS_TRANS_SB_IMAXPCT		0x00000100
+#define	XFS_TRANS_SB_REXTSIZE		0x00000200
+#define	XFS_TRANS_SB_RBMBLOCKS		0x00000400
+#define	XFS_TRANS_SB_RBLOCKS		0x00000800
+#define	XFS_TRANS_SB_REXTENTS		0x00001000
+#define	XFS_TRANS_SB_REXTSLOG		0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values.  This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define	XFS_AGF_REF		4
+#define	XFS_AGI_REF		4
+#define	XFS_AGFL_REF		3
+#define	XFS_INO_BTREE_REF	3
+#define	XFS_ALLOC_BTREE_REF	2
+#define	XFS_BMAP_BTREE_REF	2
+#define	XFS_DIR_BTREE_REF	2
+#define	XFS_INO_REF		2
+#define	XFS_ATTR_BTREE_REF	1
+#define	XFS_DQUOT_REF		1
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
+#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
+#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
+
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+			uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+			uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+				 struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c2e3e9535b26..4886b35b1ef1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -18,6 +18,7 @@
 
 #include "xfs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index ded282b10bc2..cba781edb192 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -19,6 +19,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index 01c85e3f6470..4251cd8e8fec 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -19,6 +19,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 5d7b3e40705f..29705391c896 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -17,8 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5411e01ab452..3693ce279143 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -19,6 +19,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8c75b8f67270..0167a65ec851 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 54ee3c5dee76..4d630d2c2804 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 8d71b16eccae..3bfb98bec050 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 53dfe46f3680..dcab52ec9cb6 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index a65a3cc40610..b5d952dbe6a4 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -19,6 +19,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_shared.h"
 #include "xfs_log.h"
 #include "xfs_trans_resv.h"
 #include "xfs_trans.h"
-- 
cgit v1.3


From 5706278758e334bf6a15f57c18dc16df19c83957 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 15 Oct 2013 09:17:51 +1100
Subject: xfs: unify directory/attribute format definitions

The on-disk format definitions for the directory and attribute
structures are spread across 3 header files right now, only one of
which is dedicated to defining on-disk structures and their
manipulation (xfs_dir2_format.h). Pull all the format definitions
into a single header file - xfs_da_format.h - and switch all the
code over to point at that.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr.c          |    1 +
 fs/xfs/xfs_attr_inactive.c |    1 +
 fs/xfs/xfs_attr_leaf.c     |    1 +
 fs/xfs/xfs_attr_leaf.h     |  230 --------
 fs/xfs/xfs_attr_list.c     |    1 +
 fs/xfs/xfs_attr_remote.c   |    1 +
 fs/xfs/xfs_attr_remote.h   |   27 -
 fs/xfs/xfs_bmap.c          |    2 +-
 fs/xfs/xfs_bmap_util.c     |    2 +-
 fs/xfs/xfs_da_btree.c      |    2 +-
 fs/xfs/xfs_da_btree.h      |  140 -----
 fs/xfs/xfs_da_format.h     | 1362 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_dir2.c          |    2 +-
 fs/xfs/xfs_dir2_block.c    |    2 +-
 fs/xfs/xfs_dir2_data.c     |    2 +-
 fs/xfs/xfs_dir2_format.h   |  964 -------------------------------
 fs/xfs/xfs_dir2_leaf.c     |    2 +-
 fs/xfs/xfs_dir2_node.c     |    2 +-
 fs/xfs/xfs_dir2_readdir.c  |    2 +-
 fs/xfs/xfs_dir2_sf.c       |    2 +-
 fs/xfs/xfs_export.c        |    3 +-
 fs/xfs/xfs_file.c          |    4 +-
 fs/xfs/xfs_inode.c         |    2 +-
 fs/xfs/xfs_iops.c          |    2 +-
 fs/xfs/xfs_log_recover.c   |    2 +-
 fs/xfs/xfs_log_rlimit.c    |    1 +
 fs/xfs/xfs_mount.c         |    3 +-
 fs/xfs/xfs_sb.c            |    2 +-
 fs/xfs/xfs_super.c         |    2 +-
 fs/xfs/xfs_symlink.c       |    3 +-
 fs/xfs/xfs_trace.c         |    3 +-
 fs/xfs/xfs_trans.c         |    2 +-
 fs/xfs/xfs_trans_resv.c    |    1 +
 fs/xfs/xfs_xattr.c         |    7 +-
 34 files changed, 1398 insertions(+), 1387 deletions(-)
 create mode 100644 fs/xfs/xfs_da_format.h
 delete mode 100644 fs/xfs/xfs_dir2_format.h

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 21af043bbee8..dd0f48ff8832 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -26,6 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_attr_sf.h"
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index e90a8760bc3d..60b60c008fb7 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -26,6 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 86db20a9cc02..94838fe67386 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -26,6 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index da34e64b5323..3ec5ec0b8678 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -19,16 +19,6 @@
 #ifndef __XFS_ATTR_LEAF_H__
 #define	__XFS_ATTR_LEAF_H__
 
-/*
- * Attribute storage layout, internal structure, access macros, etc.
- *
- * Attribute lists are structured around Btrees where all the data
- * elements are in the leaf nodes.  Attribute names are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of an attribute name may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- */
-
 struct attrlist;
 struct attrlist_cursor_kern;
 struct xfs_attr_list_context;
@@ -38,226 +28,6 @@ struct xfs_da_state_blk;
 struct xfs_inode;
 struct xfs_trans;
 
-/*========================================================================
- * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top.  Name/values grow from the
- * bottom but are not packed.  The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped.  When the freemap doesn't show enough space
- * for an allocation, we compact the name/value area and try again.  If we
- * still don't have enough space, then we have to split the block.  The
- * name/value structs (both local and remote versions) must be 32bit aligned.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual name string.  The root and intermediate node search always
- * takes the first-in-the-block key match found, so we should only have
- * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
- * nodes until the hash key changes or the attribute name is found.
- *
- * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
- * the leaf_entry.  The namespaces are independent only because we also look
- * at the namespace bit when we are looking for a matching attribute name.
- *
- * We also store an "incomplete" bit in the leaf_entry.  It shows that an
- * attribute is in the middle of being created and should not be shown to
- * the user if we crash during the time that the bit is set.  We clear the
- * bit when we have finished setting up the attribute.  We do this because
- * we cannot create some large attributes inside a single transaction, and we
- * need some indication that we weren't finished if we crash in the middle.
- */
-#define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */
-
-typedef struct xfs_attr_leaf_map {	/* RLE map of free bytes */
-	__be16	base;			  /* base of free region */
-	__be16	size;			  /* length of free region */
-} xfs_attr_leaf_map_t;
-
-typedef struct xfs_attr_leaf_hdr {	/* constant-structure header block */
-	xfs_da_blkinfo_t info;		/* block type, links, etc. */
-	__be16	count;			/* count of active leaf_entry's */
-	__be16	usedbytes;		/* num bytes of names/values stored */
-	__be16	firstused;		/* first used byte in name area */
-	__u8	holes;			/* != 0 if blk needs compaction */
-	__u8	pad1;
-	xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
-					/* N largest free regions */
-} xfs_attr_leaf_hdr_t;
-
-typedef struct xfs_attr_leaf_entry {	/* sorted on key, not name */
-	__be32	hashval;		/* hash value of name */
-	__be16	nameidx;		/* index into buffer of name/value */
-	__u8	flags;			/* LOCAL/ROOT/SECURE/INCOMPLETE flag */
-	__u8	pad2;			/* unused pad byte */
-} xfs_attr_leaf_entry_t;
-
-typedef struct xfs_attr_leaf_name_local {
-	__be16	valuelen;		/* number of bytes in value */
-	__u8	namelen;		/* length of name bytes */
-	__u8	nameval[1];		/* name/value bytes */
-} xfs_attr_leaf_name_local_t;
-
-typedef struct xfs_attr_leaf_name_remote {
-	__be32	valueblk;		/* block number of value bytes */
-	__be32	valuelen;		/* number of bytes in value */
-	__u8	namelen;		/* length of name bytes */
-	__u8	name[1];		/* name bytes */
-} xfs_attr_leaf_name_remote_t;
-
-typedef struct xfs_attr_leafblock {
-	xfs_attr_leaf_hdr_t	hdr;	/* constant-structure header block */
-	xfs_attr_leaf_entry_t	entries[1];	/* sorted on key, not name */
-	xfs_attr_leaf_name_local_t namelist;	/* grows from bottom of buf */
-	xfs_attr_leaf_name_remote_t valuelist;	/* grows from bottom of buf */
-} xfs_attr_leafblock_t;
-
-/*
- * CRC enabled leaf structures. Called "version 3" structures to match the
- * version number of the directory and dablk structures for this feature, and
- * attr2 is already taken by the variable inode attribute fork size feature.
- */
-struct xfs_attr3_leaf_hdr {
-	struct xfs_da3_blkinfo	info;
-	__be16			count;
-	__be16			usedbytes;
-	__be16			firstused;
-	__u8			holes;
-	__u8			pad1;
-	struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
-	__be32			pad2;		/* 64 bit alignment */
-};
-
-#define XFS_ATTR3_LEAF_CRC_OFF	(offsetof(struct xfs_attr3_leaf_hdr, info.crc))
-
-struct xfs_attr3_leafblock {
-	struct xfs_attr3_leaf_hdr	hdr;
-	struct xfs_attr_leaf_entry	entries[1];
-
-	/*
-	 * The rest of the block contains the following structures after the
-	 * leaf entries, growing from the bottom up. The variables are never
-	 * referenced, the locations accessed purely from helper functions.
-	 *
-	 * struct xfs_attr_leaf_name_local
-	 * struct xfs_attr_leaf_name_remote
-	 */
-};
-
-/*
- * incore, neutral version of the attribute leaf header
- */
-struct xfs_attr3_icleaf_hdr {
-	__uint32_t	forw;
-	__uint32_t	back;
-	__uint16_t	magic;
-	__uint16_t	count;
-	__uint16_t	usedbytes;
-	__uint16_t	firstused;
-	__u8		holes;
-	struct {
-		__uint16_t	base;
-		__uint16_t	size;
-	} freemap[XFS_ATTR_LEAF_MAPSIZE];
-};
-
-/*
- * Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
- */
-#define	XFS_ATTR_LOCAL_BIT	0	/* attr is stored locally */
-#define	XFS_ATTR_ROOT_BIT	1	/* limit access to trusted attrs */
-#define	XFS_ATTR_SECURE_BIT	2	/* limit access to secure attrs */
-#define	XFS_ATTR_INCOMPLETE_BIT	7	/* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL		(1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT		(1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE		(1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Conversion macros for converting namespace bits from argument flags
- * to ondisk flags.
- */
-#define XFS_ATTR_NSP_ARGS_MASK		(ATTR_ROOT | ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK_MASK	(XFS_ATTR_ROOT | XFS_ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK(flags)	((flags) & XFS_ATTR_NSP_ONDISK_MASK)
-#define XFS_ATTR_NSP_ARGS(flags)	((flags) & XFS_ATTR_NSP_ARGS_MASK)
-#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x)	(((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
-					 ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
-#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x)	(((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
-					 ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
-
-/*
- * Alignment for namelist and valuelist entries (since they are mixed
- * there can be only one alignment value)
- */
-#define	XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))
-
-static inline int
-xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
-{
-	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
-		return sizeof(struct xfs_attr3_leaf_hdr);
-	return sizeof(struct xfs_attr_leaf_hdr);
-}
-
-static inline struct xfs_attr_leaf_entry *
-xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
-{
-	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
-		return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
-	return &leafp->entries[0];
-}
-
-/*
- * Cast typed pointers for "local" and "remote" name/value structs.
- */
-static inline char *
-xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
-{
-	struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
-
-	return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
-}
-
-static inline xfs_attr_leaf_name_remote_t *
-xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
-{
-	return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
-}
-
-static inline xfs_attr_leaf_name_local_t *
-xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
-{
-	return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
-}
-
-/*
- * Calculate total bytes used (including trailing pad for alignment) for
- * a "local" name/value structure, a "remote" name/value structure, and
- * a pointer which might be either.
- */
-static inline int xfs_attr_leaf_entsize_remote(int nlen)
-{
-	return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
-		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
-{
-	return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
-		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local_max(int bsize)
-{
-	return (((bsize) >> 1) + ((bsize) >> 2));
-}
-
 /*
  * Used to keep a list of "remote value" extents when unlinking an inode.
  */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index cbc80d485177..8ba8e4b5e310 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -25,6 +25,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 712a502de619..578f21b70875 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -26,6 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_error.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
index b3e1e1bdfb3b..5a9acfa156d7 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -18,33 +18,6 @@
 #ifndef __XFS_ATTR_REMOTE_H__
 #define	__XFS_ATTR_REMOTE_H__
 
-#define XFS_ATTR3_RMT_MAGIC	0x5841524d	/* XARM */
-
-/*
- * There is one of these headers per filesystem block in a remote attribute.
- * This is done to ensure there is a 1:1 mapping between the attribute value
- * length and the number of blocks needed to store the attribute. This makes the
- * verification of a buffer a little more complex, but greatly simplifies the
- * allocation, reading and writing of these attributes as we don't have to guess
- * the number of blocks needed to store the attribute data.
- */
-struct xfs_attr3_rmt_hdr {
-	__be32	rm_magic;
-	__be32	rm_offset;
-	__be32	rm_bytes;
-	__be32	rm_crc;
-	uuid_t	rm_uuid;
-	__be64	rm_owner;
-	__be64	rm_blkno;
-	__be64	rm_lsn;
-};
-
-#define XFS_ATTR3_RMT_CRC_OFF	offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
-
-#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)	\
-	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
-			sizeof(struct xfs_attr3_rmt_hdr) : 0))
-
 int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
 
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 6870aad5d815..b5cb377bd2bb 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -26,8 +26,8 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4f246628a900..74ff478b9b75 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -27,7 +27,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_da_btree.h"
+#include "xfs_da_format.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index ac51abeb4782..4ec68c361c4c 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -26,9 +26,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index a762cfa4097d..e492dcadd032 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -24,146 +24,6 @@ struct xfs_inode;
 struct xfs_trans;
 struct zone;
 
-/*========================================================================
- * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
- *
- * It is used to manage a doubly linked list of all blocks at the same
- * level in the Btree, and to identify which type of block this is.
- */
-#define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
-#define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
-#define	XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
-#define	XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
-
-typedef struct xfs_da_blkinfo {
-	__be32		forw;			/* previous block in list */
-	__be32		back;			/* following block in list */
-	__be16		magic;			/* validity check on block */
-	__be16		pad;			/* unused */
-} xfs_da_blkinfo_t;
-
-/*
- * CRC enabled directory structure types
- *
- * The headers change size for the additional verification information, but
- * otherwise the tree layouts and contents are unchanged. Hence the da btree
- * code can use the struct xfs_da_blkinfo for manipulating the tree links and
- * magic numbers without modification for both v2 and v3 nodes.
- */
-#define XFS_DA3_NODE_MAGIC	0x3ebe	/* magic number: non-leaf blocks */
-#define XFS_ATTR3_LEAF_MAGIC	0x3bee	/* magic number: attribute leaf blks */
-#define	XFS_DIR3_LEAF1_MAGIC	0x3df1	/* magic number: v2 dirlf single blks */
-#define	XFS_DIR3_LEAFN_MAGIC	0x3dff	/* magic number: v2 dirlf multi blks */
-
-struct xfs_da3_blkinfo {
-	/*
-	 * the node link manipulation code relies on the fact that the first
-	 * element of this structure is the struct xfs_da_blkinfo so it can
-	 * ignore the differences in the rest of the structures.
-	 */
-	struct xfs_da_blkinfo	hdr;
-	__be32			crc;	/* CRC of block */
-	__be64			blkno;	/* first block of the buffer */
-	__be64			lsn;	/* sequence number of last write */
-	uuid_t			uuid;	/* filesystem we belong to */
-	__be64			owner;	/* inode that owns the block */
-};
-
-/*
- * This is the structure of the root and intermediate nodes in the Btree.
- * The leaf nodes are defined above.
- *
- * Entries are not packed.
- *
- * Since we have duplicate keys, use a binary search but always follow
- * all match in the block, not just the first match found.
- */
-#define	XFS_DA_NODE_MAXDEPTH	5	/* max depth of Btree */
-
-typedef struct xfs_da_node_hdr {
-	struct xfs_da_blkinfo	info;	/* block type, links, etc. */
-	__be16			__count; /* count of active entries */
-	__be16			__level; /* level above leaves (leaf == 0) */
-} xfs_da_node_hdr_t;
-
-struct xfs_da3_node_hdr {
-	struct xfs_da3_blkinfo	info;	/* block type, links, etc. */
-	__be16			__count; /* count of active entries */
-	__be16			__level; /* level above leaves (leaf == 0) */
-	__be32			__pad32;
-};
-
-#define XFS_DA3_NODE_CRC_OFF	(offsetof(struct xfs_da3_node_hdr, info.crc))
-
-typedef struct xfs_da_node_entry {
-	__be32	hashval;	/* hash value for this descendant */
-	__be32	before;		/* Btree block before this key */
-} xfs_da_node_entry_t;
-
-typedef struct xfs_da_intnode {
-	struct xfs_da_node_hdr	hdr;
-	struct xfs_da_node_entry __btree[];
-} xfs_da_intnode_t;
-
-struct xfs_da3_intnode {
-	struct xfs_da3_node_hdr	hdr;
-	struct xfs_da_node_entry __btree[];
-};
-
-/*
- * In-core version of the node header to abstract the differences in the v2 and
- * v3 disk format of the headers. Callers need to convert to/from disk format as
- * appropriate.
- */
-struct xfs_da3_icnode_hdr {
-	__uint32_t	forw;
-	__uint32_t	back;
-	__uint16_t	magic;
-	__uint16_t	count;
-	__uint16_t	level;
-};
-
-extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
-				       struct xfs_da_intnode *from);
-extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
-				     struct xfs_da3_icnode_hdr *from);
-
-static inline int
-__xfs_da3_node_hdr_size(bool v3)
-{
-	if (v3)
-		return sizeof(struct xfs_da3_node_hdr);
-	return sizeof(struct xfs_da_node_hdr);
-}
-static inline int
-xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
-{
-	bool	v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
-
-	return __xfs_da3_node_hdr_size(v3);
-}
-
-static inline struct xfs_da_node_entry *
-xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
-{
-	if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
-		struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap;
-		return dap3->__btree;
-	}
-	return dap->__btree;
-}
-
-extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to,
-				      struct xfs_da_intnode *from);
-extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to,
-				    struct xfs_da3_icnode_hdr *from);
-
-#define	XFS_LBSIZE(mp)	(mp)->m_sb.sb_blocksize
-
 /*========================================================================
  * Btree searching and modification structure definitions.
  *========================================================================*/
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
new file mode 100644
index 000000000000..89a1a219c8ff
--- /dev/null
+++ b/fs/xfs/xfs_da_format.h
@@ -0,0 +1,1362 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*========================================================================
+ * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
+#define	XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
+#define	XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+	__be32		forw;			/* previous block in list */
+	__be32		back;			/* following block in list */
+	__be16		magic;			/* validity check on block */
+	__be16		pad;			/* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC	0x3ebe	/* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC	0x3bee	/* magic number: attribute leaf blks */
+#define	XFS_DIR3_LEAF1_MAGIC	0x3df1	/* magic number: v2 dirlf single blks */
+#define	XFS_DIR3_LEAFN_MAGIC	0x3dff	/* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+	/*
+	 * the node link manipulation code relies on the fact that the first
+	 * element of this structure is the struct xfs_da_blkinfo so it can
+	 * ignore the differences in the rest of the structures.
+	 */
+	struct xfs_da_blkinfo	hdr;
+	__be32			crc;	/* CRC of block */
+	__be64			blkno;	/* first block of the buffer */
+	__be64			lsn;	/* sequence number of last write */
+	uuid_t			uuid;	/* filesystem we belong to */
+	__be64			owner;	/* inode that owns the block */
+};
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define	XFS_DA_NODE_MAXDEPTH	5	/* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+	struct xfs_da_blkinfo	info;	/* block type, links, etc. */
+	__be16			__count; /* count of active entries */
+	__be16			__level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+	struct xfs_da3_blkinfo	info;	/* block type, links, etc. */
+	__be16			__count; /* count of active entries */
+	__be16			__level; /* level above leaves (leaf == 0) */
+	__be32			__pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF	(offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+	__be32	hashval;	/* hash value for this descendant */
+	__be32	before;		/* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+	struct xfs_da_node_hdr	hdr;
+	struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+	struct xfs_da3_node_hdr	hdr;
+	struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+	__uint32_t	forw;
+	__uint32_t	back;
+	__uint16_t	magic;
+	__uint16_t	count;
+	__uint16_t	level;
+};
+
+extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
+				       struct xfs_da_intnode *from);
+extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
+				     struct xfs_da3_icnode_hdr *from);
+
+static inline int
+__xfs_da3_node_hdr_size(bool v3)
+{
+	if (v3)
+		return sizeof(struct xfs_da3_node_hdr);
+	return sizeof(struct xfs_da_node_hdr);
+}
+static inline int
+xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
+{
+	bool	v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
+
+	return __xfs_da3_node_hdr_size(v3);
+}
+
+static inline struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+	if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+		struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap;
+		return dap3->__btree;
+	}
+	return dap->__btree;
+}
+
+extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to,
+				      struct xfs_da_intnode *from);
+extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to,
+				    struct xfs_da3_icnode_hdr *from);
+
+#define	XFS_LBSIZE(mp)	(mp)->m_sb.sb_blocksize
+
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ *  - shortform - embedded into the inode
+ *  - single block - data with embedded leaf at the end
+ *  - multiple data blocks, single leaf+freeindex block
+ *  - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node blocks structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+
+#define	XFS_DIR2_BLOCK_MAGIC	0x58443242	/* XD2B: single block dirs */
+#define	XFS_DIR2_DATA_MAGIC	0x58443244	/* XD2D: multiblock dirs */
+#define	XFS_DIR2_FREE_MAGIC	0x58443246	/* XD2F: free index blocks */
+
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories.  The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible as doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ *	- a larger block header for CRC and identification purposes and so the
+ *	offsets of all the structures inside the blocks are different.
+ *
+ *	- new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define	XFS_DIR3_BLOCK_MAGIC	0x58444233	/* XDB3: single block dirs */
+#define	XFS_DIR3_DATA_MAGIC	0x58444433	/* XDD3: multiblock dirs */
+#define	XFS_DIR3_FREE_MAGIC	0x58444633	/* XDF3: free index blocks */
+
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN		0
+#define XFS_DIR3_FT_REG_FILE		1
+#define XFS_DIR3_FT_DIR			2
+#define XFS_DIR3_FT_CHRDEV		3
+#define XFS_DIR3_FT_BLKDEV		4
+#define XFS_DIR3_FT_FIFO		5
+#define XFS_DIR3_FT_SOCK		6
+#define XFS_DIR3_FT_SYMLINK		7
+#define XFS_DIR3_FT_WHT			8
+
+#define XFS_DIR3_FT_MAX			9
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef	__uint16_t	xfs_dir2_data_off_t;
+#define	NULLDATAOFF	0xffffU
+typedef uint		xfs_dir2_data_aoff_t;	/* argument form */
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef	__uint32_t	xfs_dir2_dataptr_t;
+#define	XFS_DIR2_MAX_DATAPTR	((xfs_dir2_dataptr_t)0xffffffff)
+#define	XFS_DIR2_NULL_DATAPTR	((xfs_dir2_dataptr_t)0)
+
+/*
+ * Byte offset in a directory.
+ */
+typedef	xfs_off_t	xfs_dir2_off_t;
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef	__uint32_t	xfs_dir2_db_t;
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef	struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+
+typedef union {
+	xfs_dir2_ino8_t	i8;
+	xfs_dir2_ino4_t	i4;
+} xfs_dir2_inou_t;
+#define	XFS_DIR2_MAX_SHORT_INUM	((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode.  These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures.  Due the different inode number storage size and the variable
+ * length name field in the xfs_dir2_sf_entry all these structure are
+ * variable length, and the accessors in this file should be used to iterate
+ * over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+	__uint8_t		count;		/* count of entries */
+	__uint8_t		i8count;	/* count of 8-byte inode #s */
+	xfs_dir2_inou_t		parent;		/* parent dir inode number */
+} __arch_pack xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+	__u8			namelen;	/* actual name length */
+	xfs_dir2_sf_off_t	offset;		/* saved offset */
+	__u8			name[];		/* name, variable size */
+	/*
+	 * A single byte containing the file type field follows the inode
+	 * number for version 3 directory entries.
+	 *
+	 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
+	 * variable offset after the name.
+	 */
+} __arch_pack xfs_dir2_sf_entry_t;
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+	return sizeof(struct xfs_dir2_sf_hdr) -
+		(i8count == 0) *
+		(sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+	return get_unaligned_be16(&sfep->offset.i);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+	put_unaligned_be16(off, &sfep->offset.i);
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
+}
+
+static inline int
+xfs_dir3_sf_entsize(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_sf_hdr	*hdr,
+	int			len)
+{
+	int count = sizeof(struct xfs_dir2_sf_entry); 	/* namelen + offset */
+
+	count += len;					/* name */
+	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+				sizeof(xfs_dir2_ino4_t); /* ino # */
+	if (xfs_sb_version_hasftype(&mp->m_sb))
+		count += sizeof(__uint8_t);		/* file type */
+	return count;
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
+}
+
+/*
+ * in dir3 shortform directories, the file type field is stored at a variable
+ * offset after the inode number. Because it's only a single byte, endian
+ * conversion is not necessary.
+ */
+static inline __uint8_t *
+xfs_dir3_sfe_ftypep(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (__uint8_t *)&sfep->name[sfep->namelen];
+}
+
+static inline __uint8_t
+xfs_dir3_sfe_get_ftype(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	__uint8_t	*ftp;
+
+	if (!xfs_sb_version_hasftype(&mp->m_sb))
+		return XFS_DIR3_FT_UNKNOWN;
+
+	ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
+	if (*ftp >= XFS_DIR3_FT_MAX)
+		return XFS_DIR3_FT_UNKNOWN;
+	return *ftp;
+}
+
+static inline void
+xfs_dir3_sfe_put_ftype(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep,
+	__uint8_t		ftype)
+{
+	__uint8_t	*ftp;
+
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+	if (!xfs_sb_version_hasftype(&mp->m_sb))
+		return;
+	ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
+	*ftp = ftype;
+}
+
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+
+#define	XFS_DIR2_DATA_ALIGN_LOG	3		/* i.e., 8 bytes */
+#define	XFS_DIR2_DATA_ALIGN	(1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define	XFS_DIR2_DATA_FREE_TAG	0xffff
+#define	XFS_DIR2_DATA_FD_COUNT	3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define	XFS_DIR2_SPACE_SIZE	(1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define	XFS_DIR2_DATA_SPACE	0
+#define	XFS_DIR2_DATA_OFFSET	(XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+#define	XFS_DIR2_DATA_FIRSTDB(mp)	\
+	xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
+
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+	__be16			offset;		/* start of freespace */
+	__be16			length;		/* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+	__be32			magic;		/* XFS_DIR2_DATA_MAGIC or */
+						/* XFS_DIR2_BLOCK_MAGIC */
+	xfs_dir2_data_free_t	bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+	__be32			magic;	/* magic number */
+	__be32			crc;	/* CRC of block */
+	__be64			blkno;	/* first block of the buffer */
+	__be64			lsn;	/* sequence number of last write */
+	uuid_t			uuid;	/* filesystem we belong to */
+	__be64			owner;	/* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+	struct xfs_dir3_blk_hdr	hdr;
+	xfs_dir2_data_free_t	best_free[XFS_DIR2_DATA_FD_COUNT];
+	__be32			pad;	/* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+static inline struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr;
+		return hdr3->best_free;
+	}
+	return hdr->bestfree;
+}
+
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes.  After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+	__be64			inumber;	/* inode number */
+	__u8			namelen;	/* name length */
+	__u8			name[];		/* name bytes, no null */
+     /* __u8			filetype; */	/* type of inode we point to */
+     /*	__be16                  tag; */		/* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes.  Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+	__be16			freetag;	/* XFS_DIR2_DATA_FREE_TAG */
+	__be16			length;		/* total free length */
+						/* variable offset */
+	__be16			tag;		/* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+/*
+ * Size of a data entry.
+ */
+static inline int
+__xfs_dir3_data_entsize(
+	bool	ftype,
+	int	n)
+{
+	int	size = offsetof(struct xfs_dir2_data_entry, name[0]);
+
+	size += n;
+	size += sizeof(xfs_dir2_data_off_t);
+	if (ftype)
+		size += sizeof(__uint8_t);
+	return roundup(size, XFS_DIR2_DATA_ALIGN);
+}
+static inline int
+xfs_dir3_data_entsize(
+	struct xfs_mount	*mp,
+	int			n)
+{
+	bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
+	return __xfs_dir3_data_entsize(ftype, n);
+}
+
+static inline __uint8_t
+xfs_dir3_dirent_get_ftype(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_entry *dep)
+{
+	if (xfs_sb_version_hasftype(&mp->m_sb)) {
+		__uint8_t	type = dep->name[dep->namelen];
+
+		ASSERT(type < XFS_DIR3_FT_MAX);
+		if (type < XFS_DIR3_FT_MAX)
+			return type;
+
+	}
+	return XFS_DIR3_FT_UNKNOWN;
+}
+
+static inline void
+xfs_dir3_dirent_put_ftype(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_entry *dep,
+	__uint8_t		type)
+{
+	ASSERT(type < XFS_DIR3_FT_MAX);
+	ASSERT(dep->namelen != 0);
+
+	if (xfs_sb_version_hasftype(&mp->m_sb))
+		dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static inline __be16 *
+xfs_dir3_data_entry_tag_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_entry *dep)
+{
+	return (__be16 *)((char *)dep +
+		xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+	return (__be16 *)((char *)dup +
+			be16_to_cpu(dup->length) - sizeof(__be16));
+}
+
+static inline size_t
+xfs_dir3_data_hdr_size(bool dir3)
+{
+	if (dir3)
+		return sizeof(struct xfs_dir3_data_hdr);
+	return sizeof(struct xfs_dir2_data_hdr);
+}
+
+static inline size_t
+xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr)
+{
+	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+	return xfs_dir3_data_hdr_size(dir3);
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
+}
+
+static inline struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
+}
+
+/*
+ * Offsets of . and .. in data space (always block 0)
+ *
+ * XXX: there is scope for significant optimisation of the logic here. Right
+ * now we are checking for "dir3 format" over and over again. Ideally we should
+ * only do it once for each operation.
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_dot_offset(struct xfs_mount *mp)
+{
+	return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
+{
+	return xfs_dir3_data_dot_offset(mp) +
+		xfs_dir3_data_entsize(mp, 1);
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_first_offset(struct xfs_mount *mp)
+{
+	return xfs_dir3_data_dotdot_offset(mp) +
+		xfs_dir3_data_entsize(mp, 2);
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_dot_offset(mp));
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_first_offset(mp));
+}
+
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_hdr_t       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_tail_t      |
+ *    +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+/*
+ * Offset of the leaf/node space.  First block in this space
+ * is the btree root.
+ */
+#define	XFS_DIR2_LEAF_SPACE	1
+#define	XFS_DIR2_LEAF_OFFSET	(XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+#define	XFS_DIR2_LEAF_FIRSTDB(mp)	\
+	xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+	xfs_da_blkinfo_t	info;		/* header for da routines */
+	__be16			count;		/* count of entries */
+	__be16			stale;		/* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+struct xfs_dir3_leaf_hdr {
+	struct xfs_da3_blkinfo	info;		/* header for da routines */
+	__be16			count;		/* count of entries */
+	__be16			stale;		/* count of stale entries */
+	__be32			pad;		/* 64 bit alignment */
+};
+
+struct xfs_dir3_icleaf_hdr {
+	__uint32_t		forw;
+	__uint32_t		back;
+	__uint16_t		magic;
+	__uint16_t		count;
+	__uint16_t		stale;
+};
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+	__be32			hashval;	/* hash value of name */
+	__be32			address;	/* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+	__be32			bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+	xfs_dir2_leaf_hdr_t	hdr;			/* leaf header */
+	xfs_dir2_leaf_entry_t	__ents[];		/* entries */
+} xfs_dir2_leaf_t;
+
+struct xfs_dir3_leaf {
+	struct xfs_dir3_leaf_hdr	hdr;		/* leaf header */
+	struct xfs_dir2_leaf_entry	__ents[];	/* entries */
+};
+
+#define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
+					struct xfs_dir2_leaf *from);
+
+static inline int
+xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
+{
+	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
+		return sizeof(struct xfs_dir3_leaf_hdr);
+	return sizeof(struct xfs_dir2_leaf_hdr);
+}
+
+static inline int
+xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
+{
+	return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) /
+		(uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+/*
+ * Get address of the bestcount field in the single-leaf block.
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp;
+		return lp3->__ents;
+	}
+	return lp->__ents;
+}
+
+/*
+ * Get address of the bestcount field in the single-leaf block.
+ */
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
+{
+	return (struct xfs_dir2_leaf_tail *)
+		((char *)lp + mp->m_dirblksize -
+		  sizeof(struct xfs_dir2_leaf_tail));
+}
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+	return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
+
+/*
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
+{
+	return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr.  It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+	return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+	return (xfs_dir2_db_t)
+		(by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog));
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
+{
+	return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+	return (xfs_dir2_data_aoff_t)(by &
+		((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
+{
+	return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
+			xfs_dir2_data_aoff_t o)
+{
+	return ((xfs_dir2_off_t)db <<
+		(mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+	return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog);
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+	return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
+			   xfs_dir2_data_aoff_t o)
+{
+	return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
+{
+	return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog);
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
+{
+	return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0);
+}
+
+/*
+ * Free space block defintions for the node format.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define	XFS_DIR2_FREE_SPACE	2
+#define	XFS_DIR2_FREE_OFFSET	(XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+#define	XFS_DIR2_FREE_FIRSTDB(mp)	\
+	xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET)
+
+typedef	struct xfs_dir2_free_hdr {
+	__be32			magic;		/* XFS_DIR2_FREE_MAGIC */
+	__be32			firstdb;	/* db of first entry */
+	__be32			nvalid;		/* count of valid entries */
+	__be32			nused;		/* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+	xfs_dir2_free_hdr_t	hdr;		/* block header */
+	__be16			bests[];	/* best free counts */
+						/* unused entries are -1 */
+} xfs_dir2_free_t;
+
+struct xfs_dir3_free_hdr {
+	struct xfs_dir3_blk_hdr	hdr;
+	__be32			firstdb;	/* db of first entry */
+	__be32			nvalid;		/* count of valid entries */
+	__be32			nused;		/* count of used entries */
+	__be32			pad;		/* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+	struct xfs_dir3_free_hdr hdr;
+	__be16			bests[];	/* best free counts */
+						/* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * In core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+	__uint32_t	magic;
+	__uint32_t	firstdb;
+	__uint32_t	nvalid;
+	__uint32_t	nused;
+
+};
+
+void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to,
+				 struct xfs_dir2_free *from);
+
+static inline int
+xfs_dir3_free_hdr_size(struct xfs_mount *mp)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return sizeof(struct xfs_dir3_free_hdr);
+	return sizeof(struct xfs_dir2_free_hdr);
+}
+
+static inline int
+xfs_dir3_free_max_bests(struct xfs_mount *mp)
+{
+	return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) /
+		sizeof(xfs_dir2_data_off_t);
+}
+
+static inline __be16 *
+xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static inline int
+xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+	return db % xfs_dir3_free_max_bests(mp);
+}
+
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t :
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *    | ...                                             |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_block_tail_t                           |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+typedef struct xfs_dir2_block_tail {
+	__be32		count;			/* count of leaf entries */
+	__be32		stale;			/* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Pointer to the leaf header embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr)
+{
+	return ((struct xfs_dir2_block_tail *)
+		((char *)hdr + mp->m_dirblksize)) - 1;
+}
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+	return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
+
+
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.  The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ *========================================================================
+ * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
+ *========================================================================
+ *
+ * Struct leaf_entry's are packed from the top.  Name/values grow from the
+ * bottom but are not packed.  The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again.  If we
+ * still don't have enough space, then we have to split the block.  The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string.  The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry.  The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry.  It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set.  We clear the
+ * bit when we have finished setting up the attribute.  We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */
+
+typedef struct xfs_attr_leaf_map {	/* RLE map of free bytes */
+	__be16	base;			  /* base of free region */
+	__be16	size;			  /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr {	/* constant-structure header block */
+	xfs_da_blkinfo_t info;		/* block type, links, etc. */
+	__be16	count;			/* count of active leaf_entry's */
+	__be16	usedbytes;		/* num bytes of names/values stored */
+	__be16	firstused;		/* first used byte in name area */
+	__u8	holes;			/* != 0 if blk needs compaction */
+	__u8	pad1;
+	xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+					/* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry {	/* sorted on key, not name */
+	__be32	hashval;		/* hash value of name */
+	__be16	nameidx;		/* index into buffer of name/value */
+	__u8	flags;			/* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+	__u8	pad2;			/* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+	__be16	valuelen;		/* number of bytes in value */
+	__u8	namelen;		/* length of name bytes */
+	__u8	nameval[1];		/* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+	__be32	valueblk;		/* block number of value bytes */
+	__be32	valuelen;		/* number of bytes in value */
+	__u8	namelen;		/* length of name bytes */
+	__u8	name[1];		/* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+	xfs_attr_leaf_hdr_t	hdr;	/* constant-structure header block */
+	xfs_attr_leaf_entry_t	entries[1];	/* sorted on key, not name */
+	xfs_attr_leaf_name_local_t namelist;	/* grows from bottom of buf */
+	xfs_attr_leaf_name_remote_t valuelist;	/* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+	struct xfs_da3_blkinfo	info;
+	__be16			count;
+	__be16			usedbytes;
+	__be16			firstused;
+	__u8			holes;
+	__u8			pad1;
+	struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+	__be32			pad2;		/* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF	(offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+	struct xfs_attr3_leaf_hdr	hdr;
+	struct xfs_attr_leaf_entry	entries[1];
+
+	/*
+	 * The rest of the block contains the following structures after the
+	 * leaf entries, growing from the bottom up. The variables are never
+	 * referenced, the locations accessed purely from helper functions.
+	 *
+	 * struct xfs_attr_leaf_name_local
+	 * struct xfs_attr_leaf_name_remote
+	 */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+	__uint32_t	forw;
+	__uint32_t	back;
+	__uint16_t	magic;
+	__uint16_t	count;
+	__uint16_t	usedbytes;
+	__uint16_t	firstused;
+	__u8		holes;
+	struct {
+		__uint16_t	base;
+		__uint16_t	size;
+	} freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define	XFS_ATTR_LOCAL_BIT	0	/* attr is stored locally */
+#define	XFS_ATTR_ROOT_BIT	1	/* limit access to trusted attrs */
+#define	XFS_ATTR_SECURE_BIT	2	/* limit access to secure attrs */
+#define	XFS_ATTR_INCOMPLETE_BIT	7	/* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL		(1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT		(1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE		(1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK		(ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK	(XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags)	((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags)	((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x)	(((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+					 ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x)	(((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+					 ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define	XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+		return sizeof(struct xfs_attr3_leaf_hdr);
+	return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+		return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+	return &leafp->entries[0];
+}
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+	struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+	return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+	return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+	return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
+		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+	return (((bsize) >> 1) + ((bsize) >> 2));
+}
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC	0x5841524d	/* XARM */
+
+struct xfs_attr3_rmt_hdr {
+	__be32	rm_magic;
+	__be32	rm_offset;
+	__be32	rm_bytes;
+	__be32	rm_crc;
+	uuid_t	rm_uuid;
+	__be64	rm_owner;
+	__be64	rm_blkno;
+	__be64	rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF	offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)	\
+	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+			sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index edf203ab50af..f46777fa3077 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -24,6 +24,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
@@ -31,7 +32,6 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 12dad188939d..5f172730bfa3 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -24,6 +24,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
@@ -31,7 +32,6 @@
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 47e1326c169a..8cf8034ebab1 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -24,11 +24,11 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
deleted file mode 100644
index 9cf67381adf6..000000000000
--- a/fs/xfs/xfs_dir2_format.h
+++ /dev/null
@@ -1,964 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR2_FORMAT_H__
-#define __XFS_DIR2_FORMAT_H__
-
-/*
- * Directory version 2.
- *
- * There are 4 possible formats:
- *  - shortform - embedded into the inode
- *  - single block - data with embedded leaf at the end
- *  - multiple data blocks, single leaf+freeindex block
- *  - data blocks, node and leaf blocks (btree), freeindex blocks
- *
- * Note: many node blocks structures and constants are shared with the attr
- * code and defined in xfs_da_btree.h.
- */
-
-#define	XFS_DIR2_BLOCK_MAGIC	0x58443242	/* XD2B: single block dirs */
-#define	XFS_DIR2_DATA_MAGIC	0x58443244	/* XD2D: multiblock dirs */
-#define	XFS_DIR2_FREE_MAGIC	0x58443246	/* XD2F: free index blocks */
-
-/*
- * Directory Version 3 With CRCs.
- *
- * The tree formats are the same as for version 2 directories.  The difference
- * is in the block header and dirent formats. In many cases the v3 structures
- * use v2 definitions as they are no different and this makes code sharing much
- * easier.
- *
- * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
- * format is v2 then they switch to the existing v2 code, or the format is v3
- * they implement the v3 functionality. This means the existing dir2 is a mix of
- * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
- * where there is a difference in the formats, otherwise the code is unchanged.
- *
- * Where it is possible, the code decides what to do based on the magic numbers
- * in the blocks rather than feature bits in the superblock. This means the code
- * is as independent of the external XFS code as possible as doesn't require
- * passing struct xfs_mount pointers into places where it isn't really
- * necessary.
- *
- * Version 3 includes:
- *
- *	- a larger block header for CRC and identification purposes and so the
- *	offsets of all the structures inside the blocks are different.
- *
- *	- new magic numbers to be able to detect the v2/v3 types on the fly.
- */
-
-#define	XFS_DIR3_BLOCK_MAGIC	0x58444233	/* XDB3: single block dirs */
-#define	XFS_DIR3_DATA_MAGIC	0x58444433	/* XDD3: multiblock dirs */
-#define	XFS_DIR3_FREE_MAGIC	0x58444633	/* XDF3: free index blocks */
-
-/*
- * Dirents in version 3 directories have a file type field. Additions to this
- * list are an on-disk format change, requiring feature bits. Valid values
- * are as follows:
- */
-#define XFS_DIR3_FT_UNKNOWN		0
-#define XFS_DIR3_FT_REG_FILE		1
-#define XFS_DIR3_FT_DIR			2
-#define XFS_DIR3_FT_CHRDEV		3
-#define XFS_DIR3_FT_BLKDEV		4
-#define XFS_DIR3_FT_FIFO		5
-#define XFS_DIR3_FT_SOCK		6
-#define XFS_DIR3_FT_SYMLINK		7
-#define XFS_DIR3_FT_WHT			8
-
-#define XFS_DIR3_FT_MAX			9
-
-/*
- * Byte offset in data block and shortform entry.
- */
-typedef	__uint16_t	xfs_dir2_data_off_t;
-#define	NULLDATAOFF	0xffffU
-typedef uint		xfs_dir2_data_aoff_t;	/* argument form */
-
-/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
-/*
- * Offset in data space of a data entry.
- */
-typedef	__uint32_t	xfs_dir2_dataptr_t;
-#define	XFS_DIR2_MAX_DATAPTR	((xfs_dir2_dataptr_t)0xffffffff)
-#define	XFS_DIR2_NULL_DATAPTR	((xfs_dir2_dataptr_t)0)
-
-/*
- * Byte offset in a directory.
- */
-typedef	xfs_off_t	xfs_dir2_off_t;
-
-/*
- * Directory block number (logical dirblk in file)
- */
-typedef	__uint32_t	xfs_dir2_db_t;
-
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef	struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
-
-typedef union {
-	xfs_dir2_ino8_t	i8;
-	xfs_dir2_ino4_t	i4;
-} xfs_dir2_inou_t;
-#define	XFS_DIR2_MAX_SHORT_INUM	((xfs_ino_t)0xffffffffULL)
-
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to fit into the
- * literal area of the inode.  These "shortform" directories consist of a
- * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
- * structures.  Due the different inode number storage size and the variable
- * length name field in the xfs_dir2_sf_entry all these structure are
- * variable length, and the accessors in this file should be used to iterate
- * over them.
- */
-typedef struct xfs_dir2_sf_hdr {
-	__uint8_t		count;		/* count of entries */
-	__uint8_t		i8count;	/* count of 8-byte inode #s */
-	xfs_dir2_inou_t		parent;		/* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
-
-typedef struct xfs_dir2_sf_entry {
-	__u8			namelen;	/* actual name length */
-	xfs_dir2_sf_off_t	offset;		/* saved offset */
-	__u8			name[];		/* name, variable size */
-	/*
-	 * A single byte containing the file type field follows the inode
-	 * number for version 3 directory entries.
-	 *
-	 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
-	 * variable offset after the name.
-	 */
-} __arch_pack xfs_dir2_sf_entry_t;
-
-static inline int xfs_dir2_sf_hdr_size(int i8count)
-{
-	return sizeof(struct xfs_dir2_sf_hdr) -
-		(i8count == 0) *
-		(sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
-{
-	return get_unaligned_be16(&sfep->offset.i);
-}
-
-static inline void
-xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
-{
-	put_unaligned_be16(off, &sfep->offset.i);
-}
-
-static inline struct xfs_dir2_sf_entry *
-xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
-{
-	return (struct xfs_dir2_sf_entry *)
-		((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
-}
-
-static inline int
-xfs_dir3_sf_entsize(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	int			len)
-{
-	int count = sizeof(struct xfs_dir2_sf_entry); 	/* namelen + offset */
-
-	count += len;					/* name */
-	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
-				sizeof(xfs_dir2_ino4_t); /* ino # */
-	if (xfs_sb_version_hasftype(&mp->m_sb))
-		count += sizeof(__uint8_t);		/* file type */
-	return count;
-}
-
-static inline struct xfs_dir2_sf_entry *
-xfs_dir3_sf_nextentry(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return (struct xfs_dir2_sf_entry *)
-		((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
-}
-
-/*
- * in dir3 shortform directories, the file type field is stored at a variable
- * offset after the inode number. Because it's only a single byte, endian
- * conversion is not necessary.
- */
-static inline __uint8_t *
-xfs_dir3_sfe_ftypep(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return (__uint8_t *)&sfep->name[sfep->namelen];
-}
-
-static inline __uint8_t
-xfs_dir3_sfe_get_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	__uint8_t	*ftp;
-
-	if (!xfs_sb_version_hasftype(&mp->m_sb))
-		return XFS_DIR3_FT_UNKNOWN;
-
-	ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
-	if (*ftp >= XFS_DIR3_FT_MAX)
-		return XFS_DIR3_FT_UNKNOWN;
-	return *ftp;
-}
-
-static inline void
-xfs_dir3_sfe_put_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep,
-	__uint8_t		ftype)
-{
-	__uint8_t	*ftp;
-
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
-
-	if (!xfs_sb_version_hasftype(&mp->m_sb))
-		return;
-	ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
-	*ftp = ftype;
-}
-
-/*
- * Data block structures.
- *
- * A pure data block looks like the following drawing on disk:
- *
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_hdr_t                             |
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | ...                                             |
- *    +-------------------------------------------------+
- *    | unused space                                    |
- *    +-------------------------------------------------+
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- *
- * In addition to the pure data blocks for the data and node formats,
- * most structures are also used for the combined data/freespace "block"
- * format below.
- */
-
-#define	XFS_DIR2_DATA_ALIGN_LOG	3		/* i.e., 8 bytes */
-#define	XFS_DIR2_DATA_ALIGN	(1 << XFS_DIR2_DATA_ALIGN_LOG)
-#define	XFS_DIR2_DATA_FREE_TAG	0xffff
-#define	XFS_DIR2_DATA_FD_COUNT	3
-
-/*
- * Directory address space divided into sections,
- * spaces separated by 32GB.
- */
-#define	XFS_DIR2_SPACE_SIZE	(1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
-#define	XFS_DIR2_DATA_SPACE	0
-#define	XFS_DIR2_DATA_OFFSET	(XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
-#define	XFS_DIR2_DATA_FIRSTDB(mp)	\
-	xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
-
-/*
- * Describe a free area in the data block.
- *
- * The freespace will be formatted as a xfs_dir2_data_unused_t.
- */
-typedef struct xfs_dir2_data_free {
-	__be16			offset;		/* start of freespace */
-	__be16			length;		/* length of freespace */
-} xfs_dir2_data_free_t;
-
-/*
- * Header for the data blocks.
- *
- * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
- */
-typedef struct xfs_dir2_data_hdr {
-	__be32			magic;		/* XFS_DIR2_DATA_MAGIC or */
-						/* XFS_DIR2_BLOCK_MAGIC */
-	xfs_dir2_data_free_t	bestfree[XFS_DIR2_DATA_FD_COUNT];
-} xfs_dir2_data_hdr_t;
-
-/*
- * define a structure for all the verification fields we are adding to the
- * directory block structures. This will be used in several structures.
- * The magic number must be the first entry to align with all the dir2
- * structures so we determine how to decode them just by the magic number.
- */
-struct xfs_dir3_blk_hdr {
-	__be32			magic;	/* magic number */
-	__be32			crc;	/* CRC of block */
-	__be64			blkno;	/* first block of the buffer */
-	__be64			lsn;	/* sequence number of last write */
-	uuid_t			uuid;	/* filesystem we belong to */
-	__be64			owner;	/* inode that owns the block */
-};
-
-struct xfs_dir3_data_hdr {
-	struct xfs_dir3_blk_hdr	hdr;
-	xfs_dir2_data_free_t	best_free[XFS_DIR2_DATA_FD_COUNT];
-	__be32			pad;	/* 64 bit alignment */
-};
-
-#define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
-
-static inline struct xfs_dir2_data_free *
-xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
-	if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-		struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr;
-		return hdr3->best_free;
-	}
-	return hdr->bestfree;
-}
-
-/*
- * Active entry in a data block.
- *
- * Aligned to 8 bytes.  After the variable length name field there is a
- * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
- *
- * For dir3 structures, there is file type field between the name and the tag.
- * This can only be manipulated by helper functions. It is packed hard against
- * the end of the name so any padding for rounding is between the file type and
- * the tag.
- */
-typedef struct xfs_dir2_data_entry {
-	__be64			inumber;	/* inode number */
-	__u8			namelen;	/* name length */
-	__u8			name[];		/* name bytes, no null */
-     /* __u8			filetype; */	/* type of inode we point to */
-     /*	__be16                  tag; */		/* starting offset of us */
-} xfs_dir2_data_entry_t;
-
-/*
- * Unused entry in a data block.
- *
- * Aligned to 8 bytes.  Tag appears as the last 2 bytes and must be accessed
- * using xfs_dir2_data_unused_tag_p.
- */
-typedef struct xfs_dir2_data_unused {
-	__be16			freetag;	/* XFS_DIR2_DATA_FREE_TAG */
-	__be16			length;		/* total free length */
-						/* variable offset */
-	__be16			tag;		/* starting offset of us */
-} xfs_dir2_data_unused_t;
-
-/*
- * Size of a data entry.
- */
-static inline int
-__xfs_dir3_data_entsize(
-	bool	ftype,
-	int	n)
-{
-	int	size = offsetof(struct xfs_dir2_data_entry, name[0]);
-
-	size += n;
-	size += sizeof(xfs_dir2_data_off_t);
-	if (ftype)
-		size += sizeof(__uint8_t);
-	return roundup(size, XFS_DIR2_DATA_ALIGN);
-}
-static inline int
-xfs_dir3_data_entsize(
-	struct xfs_mount	*mp,
-	int			n)
-{
-	bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
-	return __xfs_dir3_data_entsize(ftype, n);
-}
-
-static inline __uint8_t
-xfs_dir3_dirent_get_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_entry *dep)
-{
-	if (xfs_sb_version_hasftype(&mp->m_sb)) {
-		__uint8_t	type = dep->name[dep->namelen];
-
-		ASSERT(type < XFS_DIR3_FT_MAX);
-		if (type < XFS_DIR3_FT_MAX)
-			return type;
-
-	}
-	return XFS_DIR3_FT_UNKNOWN;
-}
-
-static inline void
-xfs_dir3_dirent_put_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_entry *dep,
-	__uint8_t		type)
-{
-	ASSERT(type < XFS_DIR3_FT_MAX);
-	ASSERT(dep->namelen != 0);
-
-	if (xfs_sb_version_hasftype(&mp->m_sb))
-		dep->name[dep->namelen] = type;
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static inline __be16 *
-xfs_dir3_data_entry_tag_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_entry *dep)
-{
-	return (__be16 *)((char *)dep +
-		xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
-}
-
-/*
- * Pointer to a freespace's tag word.
- */
-static inline __be16 *
-xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
-{
-	return (__be16 *)((char *)dup +
-			be16_to_cpu(dup->length) - sizeof(__be16));
-}
-
-static inline size_t
-xfs_dir3_data_hdr_size(bool dir3)
-{
-	if (dir3)
-		return sizeof(struct xfs_dir3_data_hdr);
-	return sizeof(struct xfs_dir2_data_hdr);
-}
-
-static inline size_t
-xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr)
-{
-	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-	return xfs_dir3_data_hdr_size(dir3);
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
-}
-
-static inline struct xfs_dir2_data_unused *
-xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_unused *)
-		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
-}
-
-/*
- * Offsets of . and .. in data space (always block 0)
- *
- * XXX: there is scope for significant optimisation of the logic here. Right
- * now we are checking for "dir3 format" over and over again. Ideally we should
- * only do it once for each operation.
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dot_offset(struct xfs_mount *mp)
-{
-	return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
-{
-	return xfs_dir3_data_dot_offset(mp) +
-		xfs_dir3_data_entsize(mp, 1);
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_first_offset(struct xfs_mount *mp)
-{
-	return xfs_dir3_data_dotdot_offset(mp) +
-		xfs_dir3_data_entsize(mp, 2);
-}
-
-/*
- * location of . and .. in data space (always block 0)
- */
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dot_offset(mp));
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_first_offset(mp));
-}
-
-/*
- * Leaf block structures.
- *
- * A pure leaf block looks like the following drawing on disk:
- *
- *    +---------------------------+
- *    | xfs_dir2_leaf_hdr_t       |
- *    +---------------------------+
- *    | xfs_dir2_leaf_entry_t     |
- *    | xfs_dir2_leaf_entry_t     |
- *    | xfs_dir2_leaf_entry_t     |
- *    | xfs_dir2_leaf_entry_t     |
- *    | ...                       |
- *    +---------------------------+
- *    | xfs_dir2_data_off_t       |
- *    | xfs_dir2_data_off_t       |
- *    | xfs_dir2_data_off_t       |
- *    | ...                       |
- *    +---------------------------+
- *    | xfs_dir2_leaf_tail_t      |
- *    +---------------------------+
- *
- * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
- * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
- * for directories with separate leaf nodes and free space blocks
- * (magic = XFS_DIR2_LEAFN_MAGIC).
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- */
-
-/*
- * Offset of the leaf/node space.  First block in this space
- * is the btree root.
- */
-#define	XFS_DIR2_LEAF_SPACE	1
-#define	XFS_DIR2_LEAF_OFFSET	(XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
-#define	XFS_DIR2_LEAF_FIRSTDB(mp)	\
-	xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET)
-
-/*
- * Leaf block header.
- */
-typedef struct xfs_dir2_leaf_hdr {
-	xfs_da_blkinfo_t	info;		/* header for da routines */
-	__be16			count;		/* count of entries */
-	__be16			stale;		/* count of stale entries */
-} xfs_dir2_leaf_hdr_t;
-
-struct xfs_dir3_leaf_hdr {
-	struct xfs_da3_blkinfo	info;		/* header for da routines */
-	__be16			count;		/* count of entries */
-	__be16			stale;		/* count of stale entries */
-	__be32			pad;		/* 64 bit alignment */
-};
-
-struct xfs_dir3_icleaf_hdr {
-	__uint32_t		forw;
-	__uint32_t		back;
-	__uint16_t		magic;
-	__uint16_t		count;
-	__uint16_t		stale;
-};
-
-/*
- * Leaf block entry.
- */
-typedef struct xfs_dir2_leaf_entry {
-	__be32			hashval;	/* hash value of name */
-	__be32			address;	/* address of data entry */
-} xfs_dir2_leaf_entry_t;
-
-/*
- * Leaf block tail.
- */
-typedef struct xfs_dir2_leaf_tail {
-	__be32			bestcount;
-} xfs_dir2_leaf_tail_t;
-
-/*
- * Leaf block.
- */
-typedef struct xfs_dir2_leaf {
-	xfs_dir2_leaf_hdr_t	hdr;			/* leaf header */
-	xfs_dir2_leaf_entry_t	__ents[];		/* entries */
-} xfs_dir2_leaf_t;
-
-struct xfs_dir3_leaf {
-	struct xfs_dir3_leaf_hdr	hdr;		/* leaf header */
-	struct xfs_dir2_leaf_entry	__ents[];	/* entries */
-};
-
-#define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
-
-extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
-					struct xfs_dir2_leaf *from);
-
-static inline int
-xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
-{
-	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
-		return sizeof(struct xfs_dir3_leaf_hdr);
-	return sizeof(struct xfs_dir2_leaf_hdr);
-}
-
-static inline int
-xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
-{
-	return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) /
-		(uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-/*
- * Get address of the bestcount field in the single-leaf block.
- */
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
-	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-		struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp;
-		return lp3->__ents;
-	}
-	return lp->__ents;
-}
-
-/*
- * Get address of the bestcount field in the single-leaf block.
- */
-static inline struct xfs_dir2_leaf_tail *
-xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
-{
-	return (struct xfs_dir2_leaf_tail *)
-		((char *)lp + mp->m_dirblksize -
-		  sizeof(struct xfs_dir2_leaf_tail));
-}
-
-/*
- * Get address of the bests array in the single-leaf block.
- */
-static inline __be16 *
-xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
-{
-	return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
-}
-
-/*
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-/*
- * Convert dataptr to byte in file space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
-	return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr.  It had better be aligned.
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
-	return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
-	return (xfs_dir2_db_t)
-		(by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog));
-}
-
-/*
- * Convert dataptr to a block number
- */
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
-	return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
-	return (xfs_dir2_data_aoff_t)(by &
-		((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
-	return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
-			xfs_dir2_data_aoff_t o)
-{
-	return ((xfs_dir2_off_t)db <<
-		(mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o;
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
-	return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
-	return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
-			   xfs_dir2_data_aoff_t o)
-{
-	return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
-{
-	return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
-{
-	return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0);
-}
-
-/*
- * Free space block defintions for the node format.
- */
-
-/*
- * Offset of the freespace index.
- */
-#define	XFS_DIR2_FREE_SPACE	2
-#define	XFS_DIR2_FREE_OFFSET	(XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
-#define	XFS_DIR2_FREE_FIRSTDB(mp)	\
-	xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET)
-
-typedef	struct xfs_dir2_free_hdr {
-	__be32			magic;		/* XFS_DIR2_FREE_MAGIC */
-	__be32			firstdb;	/* db of first entry */
-	__be32			nvalid;		/* count of valid entries */
-	__be32			nused;		/* count of used entries */
-} xfs_dir2_free_hdr_t;
-
-typedef struct xfs_dir2_free {
-	xfs_dir2_free_hdr_t	hdr;		/* block header */
-	__be16			bests[];	/* best free counts */
-						/* unused entries are -1 */
-} xfs_dir2_free_t;
-
-struct xfs_dir3_free_hdr {
-	struct xfs_dir3_blk_hdr	hdr;
-	__be32			firstdb;	/* db of first entry */
-	__be32			nvalid;		/* count of valid entries */
-	__be32			nused;		/* count of used entries */
-	__be32			pad;		/* 64 bit alignment */
-};
-
-struct xfs_dir3_free {
-	struct xfs_dir3_free_hdr hdr;
-	__be16			bests[];	/* best free counts */
-						/* unused entries are -1 */
-};
-
-#define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
-
-/*
- * In core version of the free block header, abstracted away from on-disk format
- * differences. Use this in the code, and convert to/from the disk version using
- * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
- */
-struct xfs_dir3_icfree_hdr {
-	__uint32_t	magic;
-	__uint32_t	firstdb;
-	__uint32_t	nvalid;
-	__uint32_t	nused;
-
-};
-
-void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to,
-				 struct xfs_dir2_free *from);
-
-static inline int
-xfs_dir3_free_hdr_size(struct xfs_mount *mp)
-{
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		return sizeof(struct xfs_dir3_free_hdr);
-	return sizeof(struct xfs_dir2_free_hdr);
-}
-
-static inline int
-xfs_dir3_free_max_bests(struct xfs_mount *mp)
-{
-	return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) /
-		sizeof(xfs_dir2_data_off_t);
-}
-
-static inline __be16 *
-xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free)
-{
-	return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static inline xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
-	return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static inline int
-xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
-	return db % xfs_dir3_free_max_bests(mp);
-}
-
-/*
- * Single block format.
- *
- * The single block format looks like the following drawing on disk:
- *
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_hdr_t                             |
- *    +-------------------------------------------------+
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t :
- *    | ...                                             |
- *    +-------------------------------------------------+
- *    | unused space                                    |
- *    +-------------------------------------------------+
- *    | ...                                             |
- *    | xfs_dir2_leaf_entry_t                           |
- *    | xfs_dir2_leaf_entry_t                           |
- *    +-------------------------------------------------+
- *    | xfs_dir2_block_tail_t                           |
- *    +-------------------------------------------------+
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- */
-
-typedef struct xfs_dir2_block_tail {
-	__be32		count;			/* count of leaf entries */
-	__be32		stale;			/* count of stale lf entries */
-} xfs_dir2_block_tail_t;
-
-/*
- * Pointer to the leaf header embedded in a data block (1-block format)
- */
-static inline struct xfs_dir2_block_tail *
-xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr)
-{
-	return ((struct xfs_dir2_block_tail *)
-		((char *)hdr + mp->m_dirblksize)) - 1;
-}
-
-/*
- * Pointer to the leaf entries embedded in a data block (1-block format)
- */
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
-{
-	return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
-}
-
-#endif /* __XFS_DIR2_FORMAT_H__ */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 1021c8356d08..d5937dab8f56 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -25,12 +25,12 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 34df052683eb..85ff354db4ae 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -24,12 +24,12 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 8f84153e98a8..6b1ffb37654a 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -25,11 +25,11 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 3ef6d402084c..8d925963a1f2 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -23,13 +23,13 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_error.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 066df425c14f..96be1d63da9b 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -22,8 +22,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
+#include "xfs_da_format.h"
 #include "xfs_dir2.h"
 #include "xfs_export.h"
 #include "xfs_bmap_btree.h"
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 57d95b2f8692..cc22aa1b7b3b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -23,6 +23,8 @@
 #include "xfs_ag.h"
 #include "xfs_trans.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_dinode.h"
@@ -31,8 +33,6 @@
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_error.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1aaef689794d..23d78885c743 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -29,8 +29,8 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 24fe333b8c41..d046dc8c1925 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -27,6 +27,7 @@
 #include "xfs_alloc.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -42,7 +43,6 @@
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2_priv.h"
 
 #include <linux/capability.h>
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index cbefc1235083..4d4f4a0bb5e4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -26,6 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_error.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
@@ -50,7 +51,6 @@
 /* Need all the magic numbers and buffer ops structures from these headers */
 #include "xfs_symlink.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_remote.h"
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
index 0961310abe46..e598bb547dee 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -23,6 +23,7 @@
 #include "xfs_ag.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_trans_space.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9515f96b4ab7..170e7294b7b0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -27,8 +27,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
+#include "xfs_da_format.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index b4762279bb78..73b24e15cef7 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -26,8 +26,8 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4886b35b1ef1..a28eca47aa4c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -27,6 +27,7 @@
 #include "xfs_alloc.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
@@ -45,7 +46,6 @@
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
 #include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
 #include "xfs_dir2.h"
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index cba781edb192..db32c504372c 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -26,8 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
+#include "xfs_da_format.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 29705391c896..6ca992020326 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -23,13 +23,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 3693ce279143..06000a9647d5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -26,7 +26,7 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
-#include "xfs_da_btree.h"
+#include "xfs_da_format.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index b5d952dbe6a4..cc8361255c8e 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -26,6 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_error.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index e01f35ea76ba..08047502683d 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -18,7 +18,12 @@
 
 #include "xfs.h"
 #include "xfs_log_format.h"
-#include "xfs_da_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_attr.h"
-- 
cgit v1.3


From 9aede1d81b12a95771db0e2f9196dd37e10aaf20 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 15 Oct 2013 09:17:52 +1100
Subject: xfs: split dquot buffer operations out

Parts of userspace want to be able to read and modify dquot buffers
(e.g. xfs_db) so we need to split out the reading and writing of
these buffers so it is easy to shared code with libxfs in userspace.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/Makefile          |   1 +
 fs/xfs/xfs_dquot.c       | 117 +------------------
 fs/xfs/xfs_dquot_buf.c   | 288 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_log_recover.c | 133 +---------------------
 fs/xfs/xfs_qm.c          |  22 +---
 fs/xfs/xfs_qm.h          |   2 -
 fs/xfs/xfs_quota.h       |   2 -
 fs/xfs/xfs_quota_defs.h  |   4 +
 8 files changed, 303 insertions(+), 266 deletions(-)
 create mode 100644 fs/xfs/xfs_dquot_buf.c

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0719e4db93f2..08f11bbf56ae 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -72,6 +72,7 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_dir2_leaf.o \
 				   xfs_dir2_node.o \
 				   xfs_dir2_sf.o \
+				   xfs_dquot_buf.o \
 				   xfs_ialloc.o \
 				   xfs_ialloc_btree.o \
 				   xfs_icreate_item.o \
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8b1d2c0a9054..f85a893c2398 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -293,118 +293,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
 	dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
 }
 
-STATIC bool
-xfs_dquot_buf_verify_crc(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
-	int			ndquots;
-	int			i;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return true;
-
-	/*
-	 * if we are in log recovery, the quota subsystem has not been
-	 * initialised so we have no quotainfo structure. In that case, we need
-	 * to manually calculate the number of dquots in the buffer.
-	 */
-	if (mp->m_quotainfo)
-		ndquots = mp->m_quotainfo->qi_dqperchunk;
-	else
-		ndquots = xfs_qm_calc_dquots_per_chunk(mp,
-					XFS_BB_TO_FSB(mp, bp->b_length));
-
-	for (i = 0; i < ndquots; i++, d++) {
-		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
-				 XFS_DQUOT_CRC_OFF))
-			return false;
-		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
-			return false;
-	}
-	return true;
-}
-
-STATIC bool
-xfs_dquot_buf_verify(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
-	xfs_dqid_t		id = 0;
-	int			ndquots;
-	int			i;
-
-	/*
-	 * if we are in log recovery, the quota subsystem has not been
-	 * initialised so we have no quotainfo structure. In that case, we need
-	 * to manually calculate the number of dquots in the buffer.
-	 */
-	if (mp->m_quotainfo)
-		ndquots = mp->m_quotainfo->qi_dqperchunk;
-	else
-		ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length);
-
-	/*
-	 * On the first read of the buffer, verify that each dquot is valid.
-	 * We don't know what the id of the dquot is supposed to be, just that
-	 * they should be increasing monotonically within the buffer. If the
-	 * first id is corrupt, then it will fail on the second dquot in the
-	 * buffer so corruptions could point to the wrong dquot in this case.
-	 */
-	for (i = 0; i < ndquots; i++) {
-		struct xfs_disk_dquot	*ddq;
-		int			error;
-
-		ddq = &d[i].dd_diskdq;
-
-		if (i == 0)
-			id = be32_to_cpu(ddq->d_id);
-
-		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
-				       "xfs_dquot_buf_verify");
-		if (error)
-			return false;
-	}
-	return true;
-}
-
-static void
-xfs_dquot_buf_read_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
-}
-
-/*
- * we don't calculate the CRC here as that is done when the dquot is flushed to
- * the buffer after the update is done. This ensures that the dquot in the
- * buffer always has an up-to-date CRC value.
- */
-void
-xfs_dquot_buf_write_verify(
-	struct xfs_buf	*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-
-	if (!xfs_dquot_buf_verify(mp, bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-		return;
-	}
-}
-
-const struct xfs_buf_ops xfs_dquot_buf_ops = {
-	.verify_read = xfs_dquot_buf_read_verify,
-	.verify_write = xfs_dquot_buf_write_verify,
-};
-
 /*
  * Allocate a block and fill it with dquots.
  * This is called when the bmapi finds a hole.
@@ -515,6 +403,7 @@ xfs_qm_dqalloc(
 
 	return (error);
 }
+
 STATIC int
 xfs_qm_dqrepair(
 	struct xfs_mount	*mp,
@@ -548,7 +437,7 @@ xfs_qm_dqrepair(
 	/* Do the actual repair of dquots in this buffer */
 	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
 		ddq = &d[i].dd_diskdq;
-		error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+		error = xfs_dqcheck(mp, ddq, firstid + i,
 				       dqp->dq_flags & XFS_DQ_ALLTYPES,
 				       XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
 		if (error) {
@@ -1134,7 +1023,7 @@ xfs_qm_dqflush(
 	/*
 	 * A simple sanity check in case we got a corrupted dquot..
 	 */
-	error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+	error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
 			   XFS_QMOPT_DOWARN, "dqflush (incore copy)");
 	if (error) {
 		xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
new file mode 100644
index 000000000000..ad2c18fbdb05
--- /dev/null
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+
+int
+xfs_calc_dquots_per_chunk(
+	struct xfs_mount	*mp,
+	unsigned int		nbblks)	/* basic block units */
+{
+	unsigned int	ndquots;
+
+	ASSERT(nbblks > 0);
+	ndquots = BBTOB(nbblks);
+	do_div(ndquots, sizeof(xfs_dqblk_t));
+
+	return ndquots;
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+int
+xfs_dqcheck(
+	struct xfs_mount *mp,
+	xfs_disk_dquot_t *ddq,
+	xfs_dqid_t	 id,
+	uint		 type,	  /* used only when IO_dorepair is true */
+	uint		 flags,
+	char		 *str)
+{
+	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
+	int		errs = 0;
+
+	/*
+	 * We can encounter an uninitialized dquot buffer for 2 reasons:
+	 * 1. If we crash while deleting the quotainode(s), and those blks got
+	 *    used for user data. This is because we take the path of regular
+	 *    file deletion; however, the size field of quotainodes is never
+	 *    updated, so all the tricks that we play in itruncate_finish
+	 *    don't quite matter.
+	 *
+	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
+	 *    But the allocation will be replayed so we'll end up with an
+	 *    uninitialized quota block.
+	 *
+	 * This is all fine; things are still consistent, and we haven't lost
+	 * any quota information. Just don't complain about bad dquot blks.
+	 */
+	if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+		errs++;
+	}
+	if (ddq->d_version != XFS_DQUOT_VERSION) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+			str, id, ddq->d_version, XFS_DQUOT_VERSION);
+		errs++;
+	}
+
+	if (ddq->d_flags != XFS_DQ_USER &&
+	    ddq->d_flags != XFS_DQ_PROJ &&
+	    ddq->d_flags != XFS_DQ_GROUP) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+			str, id, ddq->d_flags);
+		errs++;
+	}
+
+	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			xfs_alert(mp,
+			"%s : ondisk-dquot 0x%p, ID mismatch: "
+			"0x%x expected, found id 0x%x",
+			str, ddq, id, be32_to_cpu(ddq->d_id));
+		errs++;
+	}
+
+	if (!errs && ddq->d_id) {
+		if (ddq->d_blk_softlimit &&
+		    be64_to_cpu(ddq->d_bcount) >
+				be64_to_cpu(ddq->d_blk_softlimit)) {
+			if (!ddq->d_btimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+		if (ddq->d_ino_softlimit &&
+		    be64_to_cpu(ddq->d_icount) >
+				be64_to_cpu(ddq->d_ino_softlimit)) {
+			if (!ddq->d_itimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+		if (ddq->d_rtb_softlimit &&
+		    be64_to_cpu(ddq->d_rtbcount) >
+				be64_to_cpu(ddq->d_rtb_softlimit)) {
+			if (!ddq->d_rtbtimer) {
+				if (flags & XFS_QMOPT_DOWARN)
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+					str, (int)be32_to_cpu(ddq->d_id), ddq);
+				errs++;
+			}
+		}
+	}
+
+	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+		return errs;
+
+	if (flags & XFS_QMOPT_DOWARN)
+		xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+	/*
+	 * Typically, a repair is only requested by quotacheck.
+	 */
+	ASSERT(id != -1);
+	ASSERT(flags & XFS_QMOPT_DQREPAIR);
+	memset(d, 0, sizeof(xfs_dqblk_t));
+
+	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+	d->dd_diskdq.d_flags = type;
+	d->dd_diskdq.d_id = cpu_to_be32(id);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
+
+	return errs;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	int			ndquots;
+	int			i;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return true;
+
+	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_calc_dquots_per_chunk(mp,
+					XFS_BB_TO_FSB(mp, bp->b_length));
+
+	for (i = 0; i < ndquots; i++, d++) {
+		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF))
+			return false;
+		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+			return false;
+	}
+	return true;
+}
+
+STATIC bool
+xfs_dquot_buf_verify(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	xfs_dqid_t		id = 0;
+	int			ndquots;
+	int			i;
+
+	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_calc_dquots_per_chunk(mp, bp->b_length);
+
+	/*
+	 * On the first read of the buffer, verify that each dquot is valid.
+	 * We don't know what the id of the dquot is supposed to be, just that
+	 * they should be increasing monotonically within the buffer. If the
+	 * first id is corrupt, then it will fail on the second dquot in the
+	 * buffer so corruptions could point to the wrong dquot in this case.
+	 */
+	for (i = 0; i < ndquots; i++) {
+		struct xfs_disk_dquot	*ddq;
+		int			error;
+
+		ddq = &d[i].dd_diskdq;
+
+		if (i == 0)
+			id = be32_to_cpu(ddq->d_id);
+
+		error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+				       "xfs_dquot_buf_verify");
+		if (error)
+			return false;
+	}
+	return true;
+}
+
+static void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify(mp, bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.verify_read = xfs_dquot_buf_read_verify,
+	.verify_write = xfs_dquot_buf_write_verify,
+};
+
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4d4f4a0bb5e4..cb3277e07844 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2363,7 +2363,7 @@ xlog_recover_do_reg_buffer(
 					item->ri_buf[i].i_len, __func__);
 				goto next;
 			}
-			error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
+			error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
 			if (error)
@@ -2394,133 +2394,6 @@ xlog_recover_do_reg_buffer(
 		xlog_recover_validate_buf_type(mp, bp, buf_f);
 }
 
-/*
- * Do some primitive error checking on ondisk dquot data structures.
- */
-int
-xfs_qm_dqcheck(
-	struct xfs_mount *mp,
-	xfs_disk_dquot_t *ddq,
-	xfs_dqid_t	 id,
-	uint		 type,	  /* used only when IO_dorepair is true */
-	uint		 flags,
-	char		 *str)
-{
-	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
-	int		errs = 0;
-
-	/*
-	 * We can encounter an uninitialized dquot buffer for 2 reasons:
-	 * 1. If we crash while deleting the quotainode(s), and those blks got
-	 *    used for user data. This is because we take the path of regular
-	 *    file deletion; however, the size field of quotainodes is never
-	 *    updated, so all the tricks that we play in itruncate_finish
-	 *    don't quite matter.
-	 *
-	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
-	 *    But the allocation will be replayed so we'll end up with an
-	 *    uninitialized quota block.
-	 *
-	 * This is all fine; things are still consistent, and we haven't lost
-	 * any quota information. Just don't complain about bad dquot blks.
-	 */
-	if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
-			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
-		errs++;
-	}
-	if (ddq->d_version != XFS_DQUOT_VERSION) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
-			str, id, ddq->d_version, XFS_DQUOT_VERSION);
-		errs++;
-	}
-
-	if (ddq->d_flags != XFS_DQ_USER &&
-	    ddq->d_flags != XFS_DQ_PROJ &&
-	    ddq->d_flags != XFS_DQ_GROUP) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
-			str, id, ddq->d_flags);
-		errs++;
-	}
-
-	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
-		if (flags & XFS_QMOPT_DOWARN)
-			xfs_alert(mp,
-			"%s : ondisk-dquot 0x%p, ID mismatch: "
-			"0x%x expected, found id 0x%x",
-			str, ddq, id, be32_to_cpu(ddq->d_id));
-		errs++;
-	}
-
-	if (!errs && ddq->d_id) {
-		if (ddq->d_blk_softlimit &&
-		    be64_to_cpu(ddq->d_bcount) >
-				be64_to_cpu(ddq->d_blk_softlimit)) {
-			if (!ddq->d_btimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					xfs_alert(mp,
-			"%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-		if (ddq->d_ino_softlimit &&
-		    be64_to_cpu(ddq->d_icount) >
-				be64_to_cpu(ddq->d_ino_softlimit)) {
-			if (!ddq->d_itimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					xfs_alert(mp,
-			"%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-		if (ddq->d_rtb_softlimit &&
-		    be64_to_cpu(ddq->d_rtbcount) >
-				be64_to_cpu(ddq->d_rtb_softlimit)) {
-			if (!ddq->d_rtbtimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					xfs_alert(mp,
-			"%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-	}
-
-	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
-		return errs;
-
-	if (flags & XFS_QMOPT_DOWARN)
-		xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
-
-	/*
-	 * Typically, a repair is only requested by quotacheck.
-	 */
-	ASSERT(id != -1);
-	ASSERT(flags & XFS_QMOPT_DQREPAIR);
-	memset(d, 0, sizeof(xfs_dqblk_t));
-
-	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
-	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-	d->dd_diskdq.d_flags = type;
-	d->dd_diskdq.d_id = cpu_to_be32(id);
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
-		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
-				 XFS_DQUOT_CRC_OFF);
-	}
-
-	return errs;
-}
-
 /*
  * Perform a dquot buffer recovery.
  * Simple algorithm: if we have found a QUOTAOFF log item of the same type
@@ -3126,7 +2999,7 @@ xlog_recover_dquot_pass2(
 	 */
 	dq_f = item->ri_buf[0].i_addr;
 	ASSERT(dq_f);
-	error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+	error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
 			   "xlog_recover_dquot_pass2 (log copy)");
 	if (error)
 		return XFS_ERROR(EIO);
@@ -3146,7 +3019,7 @@ xlog_recover_dquot_pass2(
 	 * was among a chunk of dquots created earlier, and we did some
 	 * minimal initialization then.
 	 */
-	error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+	error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
 			   "xlog_recover_dquot_pass2");
 	if (error) {
 		xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index dc2d4f7ad7c9..6dfb4e320498 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -665,20 +665,6 @@ xfs_qm_dqdetach(
 	}
 }
 
-int
-xfs_qm_calc_dquots_per_chunk(
-	struct xfs_mount	*mp,
-	unsigned int		nbblks)	/* basic block units */
-{
-	unsigned int	ndquots;
-
-	ASSERT(nbblks > 0);
-	ndquots = BBTOB(nbblks);
-	do_div(ndquots, sizeof(xfs_dqblk_t));
-
-	return ndquots;
-}
-
 struct xfs_qm_isolate {
 	struct list_head	buffers;
 	struct list_head	dispose;
@@ -859,7 +845,7 @@ xfs_qm_init_quotainfo(
 
 	/* Precalc some constants */
 	qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
-	qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp,
+	qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(mp,
 							qinf->qi_dqchunklen);
 
 	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
@@ -1093,10 +1079,10 @@ xfs_qm_reset_dqcounts(
 		/*
 		 * Do a sanity check, and if needed, repair the dqblk. Don't
 		 * output any warnings because it's perfectly possible to
-		 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
+		 * find uninitialised dquot blks. See comment in xfs_dqcheck.
 		 */
-		(void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
-				      "xfs_quotacheck");
+		xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+			    "xfs_quotacheck");
 		ddq->d_bcount = 0;
 		ddq->d_icount = 0;
 		ddq->d_rtbcount = 0;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 2b602df9c242..a788b66a5cb1 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -103,8 +103,6 @@ xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
 	return NULL;
 }
 
-extern int	xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
-					     unsigned int nbblks);
 extern void	xfs_trans_mod_dquot(struct xfs_trans *,
 					struct xfs_dquot *, uint, long);
 extern int	xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 84661d4431fa..5376dd406ba2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -150,8 +150,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
 	xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
 				f | XFS_QMOPT_RES_REGBLKS)
 
-extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
-				xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
index e6b0d6e1f4f2..b3b2b1065c0f 100644
--- a/fs/xfs/xfs_quota_defs.h
+++ b/fs/xfs/xfs_quota_defs.h
@@ -154,4 +154,8 @@ typedef __uint16_t	xfs_qwarncnt_t;
 		(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
 #define XFS_QMOPT_RESBLK_MASK	(XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
 
+extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
+		       xfs_dqid_t id, uint type, uint flags, char *str);
+extern int xfs_calc_dquots_per_chunk(struct xfs_mount *mp, unsigned int nbblks);
+
 #endif	/* __XFS_QUOTA_H__ */
-- 
cgit v1.3


From d420e5c810bce5debce0238021b410d0ef99cf08 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 15 Oct 2013 09:17:53 +1100
Subject: xfs: remove unused transaction callback variables

We don't do callbacks at transaction commit time, no do we have any
infrastructure to set up or run such callbacks, so remove the
variables and typedefs for these operations. If we ever need to add
callbacks, we can reintroduce the variables at that time.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_trans.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 09cf40b89e8c..71c835e9e810 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -85,18 +85,11 @@ struct xfs_item_ops {
 #define XFS_ITEM_LOCKED		2
 #define XFS_ITEM_FLUSHING	3
 
-/*
- * This is the type of function which can be given to xfs_trans_callback()
- * to be called upon the transaction's commit to disk.
- */
-typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
-
 /*
  * This is the structure maintained for every active transaction.
  */
 typedef struct xfs_trans {
 	unsigned int		t_magic;	/* magic number */
-	xfs_log_callback_t	t_logcb;	/* log callback struct */
 	unsigned int		t_type;		/* transaction type */
 	unsigned int		t_log_res;	/* amt of log space resvd */
 	unsigned int		t_log_count;	/* count for perm log res */
-- 
cgit v1.3


From 239880ef6454ccff2ba8d762c3f86e8278f0ce1c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 23 Oct 2013 10:50:10 +1100
Subject: xfs: decouple log and transaction headers

xfs_trans.h has a dependency on xfs_log.h for a couple of
structures. Most code that does transactions doesn't need to know
anything about the log, but this dependency means that they have to
include xfs_log.h. Decouple the xfs_trans.h and xfs_log.h header
files and clean up the includes to be in dependency order.

In doing this, remove the direct include of xfs_trans_reserve.h from
xfs_trans.h so that we remove the dependency between xfs_trans.h and
xfs_mount.h. Hence the xfs_trans.h include can be moved to the
indicate the actual dependencies other header files have on it.

Note that these are kernel only header files, so this does not
translate to any userspace changes at all.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c          |  6 ++++--
 fs/xfs/xfs_alloc_btree.c    |  6 +++---
 fs/xfs/xfs_aops.c           |  7 ++++---
 fs/xfs/xfs_attr.c           |  8 ++++----
 fs/xfs/xfs_attr_inactive.c  |  8 ++++----
 fs/xfs/xfs_attr_leaf.c      |  7 +++----
 fs/xfs/xfs_attr_list.c      |  6 +++---
 fs/xfs/xfs_attr_remote.c    |  7 +++----
 fs/xfs/xfs_bit.c            |  5 +----
 fs/xfs/xfs_bmap.c           |  7 ++++---
 fs/xfs/xfs_bmap_btree.c     |  5 +++--
 fs/xfs/xfs_bmap_util.c      |  9 +++++----
 fs/xfs/xfs_btree.c          |  5 +++--
 fs/xfs/xfs_buf.c            |  5 +++--
 fs/xfs/xfs_buf_item.c       |  7 ++++---
 fs/xfs/xfs_da_btree.c       |  7 ++++---
 fs/xfs/xfs_dir2.c           |  6 +++---
 fs/xfs/xfs_dir2_block.c     |  6 +++---
 fs/xfs/xfs_dir2_data.c      |  6 +++---
 fs/xfs/xfs_dir2_leaf.c      |  6 +++---
 fs/xfs/xfs_dir2_node.c      |  6 +++---
 fs/xfs/xfs_dir2_readdir.c   |  6 +++---
 fs/xfs/xfs_dir2_sf.c        |  6 +++---
 fs/xfs/xfs_discard.c        |  5 +++--
 fs/xfs/xfs_dquot.c          | 10 ++++++----
 fs/xfs/xfs_dquot_buf.c      |  5 +++--
 fs/xfs/xfs_dquot_item.c     |  6 ++++--
 fs/xfs/xfs_error.c          |  5 ++---
 fs/xfs/xfs_export.c         |  7 ++++---
 fs/xfs/xfs_extent_busy.c    |  6 ++++--
 fs/xfs/xfs_extfree_item.c   |  8 ++++----
 fs/xfs/xfs_file.c           |  6 ++++--
 fs/xfs/xfs_filestream.c     | 10 +++++-----
 fs/xfs/xfs_fsops.c          |  9 +++++----
 fs/xfs/xfs_ialloc.c         |  7 ++++---
 fs/xfs/xfs_ialloc_btree.c   |  6 +++---
 fs/xfs/xfs_icache.c         |  9 ++++-----
 fs/xfs/xfs_icreate_item.c   |  5 +++--
 fs/xfs/xfs_inode.c          | 12 +++++++-----
 fs/xfs/xfs_inode_buf.c      |  7 ++++---
 fs/xfs/xfs_inode_fork.c     |  6 +++---
 fs/xfs/xfs_inode_item.c     |  8 ++++----
 fs/xfs/xfs_ioctl.c          |  7 ++++---
 fs/xfs/xfs_ioctl32.c        |  4 ++--
 fs/xfs/xfs_iomap.c          | 11 ++++++-----
 fs/xfs/xfs_iops.c           | 13 +++++++------
 fs/xfs/xfs_itable.c         |  4 ++--
 fs/xfs/xfs_log.c            |  8 +++++---
 fs/xfs/xfs_log.h            | 10 ++--------
 fs/xfs/xfs_log_cil.c        | 10 ++++++----
 fs/xfs/xfs_log_priv.h       |  7 ++++---
 fs/xfs/xfs_log_recover.c    | 13 +++++--------
 fs/xfs/xfs_log_rlimit.c     |  4 ++--
 fs/xfs/xfs_message.c        |  5 ++---
 fs/xfs/xfs_mount.c          | 10 ++++++----
 fs/xfs/xfs_qm.c             |  9 +++++----
 fs/xfs/xfs_qm_bhv.c         |  5 +++--
 fs/xfs/xfs_qm_syscalls.c    |  7 ++++---
 fs/xfs/xfs_quotaops.c       |  2 +-
 fs/xfs/xfs_rtalloc.c        |  7 ++++---
 fs/xfs/xfs_sb.c             |  7 ++++---
 fs/xfs/xfs_super.c          |  8 +++++---
 fs/xfs/xfs_symlink.c        |  8 +++++---
 fs/xfs/xfs_symlink_remote.c |  5 +++--
 fs/xfs/xfs_trace.c          |  8 +++++---
 fs/xfs/xfs_trans.c          | 10 ++++++----
 fs/xfs/xfs_trans.h          | 10 +++++-----
 fs/xfs/xfs_trans_ail.c      |  7 ++++---
 fs/xfs/xfs_trans_buf.c      |  5 +++--
 fs/xfs/xfs_trans_dquot.c    |  7 ++++---
 fs/xfs/xfs_trans_extfree.c  |  5 +++--
 fs/xfs/xfs_trans_inode.c    |  5 +++--
 fs/xfs/xfs_trans_priv.h     |  1 -
 fs/xfs/xfs_trans_resv.c     |  6 +++---
 fs/xfs/xfs_xattr.c          |  3 +--
 75 files changed, 276 insertions(+), 239 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 2bfca06e0ee7..999db7a95663 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -18,10 +18,10 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_shared.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -36,7 +36,9 @@
 #include "xfs_error.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
+#include "xfs_log.h"
 
 struct workqueue_struct *xfs_alloc_wq;
 
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index cafc90251d19..0bd33bfbb789 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -17,9 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -34,6 +33,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_trans.h"
 
 
 STATIC struct xfs_btree_cur *
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2c61e9622f57..caa63cb3e2fd 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -16,16 +16,17 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index dd0f48ff8832..ce6f9e0b6c5f 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -17,12 +17,11 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -33,6 +32,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 60b60c008fb7..45480f5b6aea 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -18,11 +18,11 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -36,6 +36,7 @@
 #include "xfs_attr_remote.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
@@ -43,7 +44,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
-#include "xfs_trans_priv.h"
 
 /*
  * Look at all the extents for this logical region,
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 94838fe67386..c10c70c88ec9 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -18,11 +18,9 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -37,6 +35,7 @@
 #include "xfs_attr_remote.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 8ba8e4b5e310..a319eb4d123f 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -18,10 +18,9 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -36,6 +35,7 @@
 #include "xfs_attr_remote.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 578f21b70875..1407191f90c3 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -18,11 +18,9 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -33,6 +31,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index 48228848f5ae..16ce44a2b43e 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -16,10 +16,7 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
+#include "xfs_log_format.h"
 
 /*
  * XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index b5cb377bd2bb..c3756ac09658 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -17,12 +17,12 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -37,6 +37,7 @@
 #include "xfs_btree.h"
 #include "xfs_mount.h"
 #include "xfs_itable.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bb8de8e399c4..89ef4e67ff54 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -18,9 +18,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -29,6 +29,7 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 74ff478b9b75..271c0d6ceafd 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -18,12 +18,11 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -34,6 +33,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_trans.h"
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
@@ -44,6 +44,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_log.h"
 
 /* Kernel only BMAP related definitions and functions */
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 1a297e26e26c..08365117a933 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -18,9 +18,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -29,6 +29,7 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_buf_item.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 75e4ea7b109a..c7f0b77dcb00 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -34,12 +34,13 @@
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
 
-#include "xfs_sb.h"
+#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_log.h"
+#include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 static kmem_zone_t *xfs_buf_zone;
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f1d85cfc0a54..b6d20c55282b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -17,17 +17,18 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 
 kmem_zone_t	*xfs_buf_item_zone;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 4ec68c361c4c..377011195975 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -18,11 +18,11 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -33,6 +33,7 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index f46777fa3077..cef16615f761 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -17,10 +17,9 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -30,6 +29,7 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2.h"
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 5f172730bfa3..6c7cd3ce9e72 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -18,9 +18,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -29,6 +28,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 8cf8034ebab1..3f7edddb8eff 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -18,9 +18,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -32,6 +31,7 @@
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index d5937dab8f56..44fe4b83ac9b 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -18,10 +18,9 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -35,6 +34,7 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 85ff354db4ae..7836f048102e 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -18,9 +18,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -34,6 +33,7 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 6b1ffb37654a..048d582525cf 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -18,10 +18,9 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -35,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
+#include "xfs_trans.h"
 
 /*
  * Directory file type support functions
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 8d925963a1f2..ac3bb58b58b9 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -17,9 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -28,6 +27,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_error.h"
 #include "xfs_dir2.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 45560ee1a4ba..d38f34cecb40 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -17,8 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -33,6 +33,7 @@
 #include "xfs_extent_busy.h"
 #include "xfs_discard.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 STATIC int
 xfs_trim_extents(
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index f85a893c2398..47a815d4a345 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -18,29 +18,31 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_shared.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_attr.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 /*
  * Lock order:
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index ad2c18fbdb05..3e84b3a57f57 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -19,15 +19,16 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_quota.h"
+#include "xfs_trans.h"
 #include "xfs_qm.h"
 #include "xfs_error.h"
 #include "xfs_cksum.h"
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index e838d84b4e85..f8a47f77c3fa 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -18,8 +18,8 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_alloc.h"
@@ -32,9 +32,11 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_attr.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
+#include "xfs_log.h"
 
 static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
 {
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 40f2985a27bc..d6c188cc7d43 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -17,9 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 96be1d63da9b..8502aee8e65d 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -16,9 +16,8 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -27,9 +26,11 @@
 #include "xfs_export.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_log.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 74828a7895eb..de462640606f 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -20,9 +20,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -31,6 +31,8 @@
 #include "xfs_inode.h"
 #include "xfs_extent_busy.h"
 #include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
 
 void
 xfs_extent_busy_insert(
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index dc53e8febbbe..3680d04f973f 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -17,14 +17,14 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
 #include "xfs_extfree_item.h"
 
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index cc22aa1b7b3b..8b43bfa023af 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -18,10 +18,10 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_trans.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
@@ -29,6 +29,7 @@
 #include "xfs_alloc.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
@@ -37,6 +38,7 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 #include <linux/aio.h>
 #include <linux/dcache.h>
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index ce78e654d37b..7f2c3c1c8ba4 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -16,15 +16,15 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ag.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_mount.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_alloc.h"
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index e5d2c43c65c4..ceaec3165921 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,11 +17,10 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -30,6 +29,7 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
 #include "xfs_error.h"
@@ -41,6 +41,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_filestream.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 /*
  * File system operations
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 8530cabf128d..c973ff8c4a7c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -17,12 +17,12 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -38,6 +38,7 @@
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 #include "xfs_cksum.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_icreate_item.h"
 #include "xfs_icache.h"
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 5448eb6b8c12..2a0b6158e4c4 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -17,10 +17,9 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -35,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_trans.h"
 
 
 STATIC int
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 76a4ecd638c3..6db11556731d 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -18,12 +18,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_log_priv.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -32,6 +29,8 @@
 #include "xfs_dinode.h"
 #include "xfs_error.h"
 #include "xfs_filestream.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index df8baf38eb85..d2eaccfa73f4 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -18,12 +18,13 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 #include "xfs_icreate_item.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 23d78885c743..f0440dcb5d8c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -19,13 +19,11 @@
 
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_space.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -39,6 +37,8 @@
 #include "xfs_attr.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
@@ -53,6 +53,8 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
 
 kmem_zone_t *xfs_inode_zone;
 
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 65fa56edecf3..c0f974b6cd46 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -17,10 +17,10 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -32,6 +32,7 @@
 #include "xfs_cksum.h"
 #include "xfs_icache.h"
 #include "xfs_ialloc.h"
+#include "xfs_trans.h"
 
 /*
  * Check that none of the inode's in the buffer have a next
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index 4fa56fcb38d1..b89b58a3a350 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -20,10 +20,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -33,6 +32,7 @@
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 378081109844..9f8e85ef2681 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,19 +17,19 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_trans_priv.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_trans_priv.h"
 
 
 kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c6c62d12a934..ea65a73d49ba 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,10 +17,10 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_alloc.h"
@@ -35,6 +35,7 @@
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_fsops.h"
 #include "xfs_discard.h"
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index f671f7e472ac..1be3333b260c 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -22,8 +22,8 @@
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d8fb6dfe75ed..adadf80709b8 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,21 +17,22 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d046dc8c1925..3f2ae19c8ca0 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,15 +17,12 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_bmap_btree.h"
@@ -33,10 +30,14 @@
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_acl.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_attr.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 4f50e637f33b..964b1caae085 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -18,9 +18,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fbaf70af3d61..5275ad886971 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -18,19 +18,21 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
-#include "xfs_trans_priv.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 1c458487f000..e148719e0a5d 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -18,8 +18,6 @@
 #ifndef	__XFS_LOG_H__
 #define __XFS_LOG_H__
 
-#include "xfs_log_format.h"
-
 struct xfs_log_vec {
 	struct xfs_log_vec	*lv_next;	/* next lv in build list */
 	int			lv_niovecs;	/* number of iovecs in lv */
@@ -82,11 +80,7 @@ struct xlog_ticket;
 struct xfs_log_item;
 struct xfs_item_ops;
 struct xfs_trans;
-
-void	xfs_log_item_init(struct xfs_mount	*mp,
-			struct xfs_log_item	*item,
-			int			type,
-			const struct xfs_item_ops *ops);
+struct xfs_log_callback;
 
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 		       struct xlog_ticket *ticket,
@@ -114,7 +108,7 @@ xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
 void	  xfs_log_space_wake(struct xfs_mount *mp);
 int	  xfs_log_notify(struct xfs_mount	*mp,
 			 struct xlog_in_core	*iclog,
-			 xfs_log_callback_t	*callback_entry);
+			 struct xfs_log_callback *callback_entry);
 int	  xfs_log_release_iclog(struct xfs_mount *mp,
 			 struct xlog_in_core	 *iclog);
 int	  xfs_log_reserve(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index a844f77d2cc3..5eb51fc5eb84 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -17,11 +17,9 @@
 
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_log_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_log_priv.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -29,6 +27,10 @@
 #include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
 #include "xfs_discard.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
 
 /*
  * Allocate a new ticket. Failing to get a new ticket makes it really hard to
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f80cff26fda9..9bc403a9e54f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -22,6 +22,7 @@ struct xfs_buf;
 struct xlog;
 struct xlog_ticket;
 struct xfs_mount;
+struct xfs_log_callback;
 
 /*
  * Flags for log structure
@@ -227,8 +228,8 @@ typedef struct xlog_in_core {
 
 	/* Callback structures need their own cacheline */
 	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
-	xfs_log_callback_t	*ic_callback;
-	xfs_log_callback_t	**ic_callback_tail;
+	struct xfs_log_callback	*ic_callback;
+	struct xfs_log_callback	**ic_callback_tail;
 
 	/* reference counts need their own cacheline */
 	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
@@ -254,7 +255,7 @@ struct xfs_cil_ctx {
 	int			space_used;	/* aggregate size of regions */
 	struct list_head	busy_extents;	/* busy extents in chkpt */
 	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
-	xfs_log_callback_t	log_cb;		/* completion callback hook. */
+	struct xfs_log_callback	log_cb;		/* completion callback hook. */
 	struct list_head	committing;	/* ctx committing list */
 };
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index cb3277e07844..832ba5d5eb3a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,12 +17,12 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -34,11 +34,12 @@
 #include "xfs_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
+#include "xfs_log.h"
 #include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
 #include "xfs_log_recover.h"
 #include "xfs_extfree_item.h"
 #include "xfs_trans_priv.h"
@@ -46,14 +47,10 @@
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
-#include "xfs_icreate_item.h"
 
 /* Need all the magic numbers and buffer ops structures from these headers */
-#include "xfs_symlink.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
 
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
index e598bb547dee..3749bc1c00a4 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -18,8 +18,8 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_ag.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 9163dc140532..63ca2f0420b1 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,9 +17,8 @@
 
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 170e7294b7b0..83f7810c53cd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -17,17 +17,19 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6dfb4e320498..e2f675644b4d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -18,14 +18,12 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_shared.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
@@ -33,10 +31,13 @@
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
+#include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
+#include "xfs_quota.h"
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 3af50ccdfac1..704628961159 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -18,8 +18,8 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_alloc.h"
@@ -32,6 +32,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_attr.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
 
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 174390703eb6..ab2326aaaeda 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -20,11 +20,11 @@
 
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_alloc.h"
@@ -32,6 +32,7 @@
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 1326d81596c2..2dd467b0b475 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -17,8 +17,8 @@
  */
 #include "xfs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_log.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6f63de303e8e..23c5ca24a02d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -17,11 +17,11 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -34,6 +34,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_fsops.h"
 #include "xfs_error.h"
+#include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 73b24e15cef7..1b0a6420f1b8 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -18,11 +18,10 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -44,6 +43,8 @@
 #include "xfs_fsops.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_buf_item.h"
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a28eca47aa4c..4e20b420ff72 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,11 +17,11 @@
  */
 
 #include "xfs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_alloc.h"
@@ -41,7 +41,9 @@
 #include "xfs_itable.h"
 #include "xfs_fsops.h"
 #include "xfs_attr.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
+#include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index db32c504372c..93fc72c96fad 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -17,12 +17,12 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_shared.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
-#include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -41,7 +41,9 @@
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
 #include "xfs_symlink.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
+#include "xfs_log.h"
 
 /* ----- Kernel only functions below ----- */
 STATIC int
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index 4251cd8e8fec..bf59a2b45f8c 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -19,9 +19,9 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
 #include "xfs_ag.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
@@ -31,6 +31,7 @@
 #include "xfs_trace.h"
 #include "xfs_symlink.h"
 #include "xfs_cksum.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 
 
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 6ca992020326..889bb4233323 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -17,10 +17,10 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -38,6 +38,8 @@
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_quota.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 06000a9647d5..8d1595f9fd50 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -18,10 +18,10 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -38,10 +38,12 @@
 #include "xfs_extent_busy.h"
 #include "xfs_bmap.h"
 #include "xfs_quota.h"
-#include "xfs_qm.h"
+#include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_trans_space.h"
+#include "xfs_qm.h"
 #include "xfs_inode_item.h"
+#include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 71c835e9e810..9cc5fbed4f26 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,10 +18,6 @@
 #ifndef	__XFS_TRANS_H__
 #define	__XFS_TRANS_H__
 
-struct xfs_log_item;
-
-#include "xfs_trans_resv.h"
-
 /* kernel only transaction subsystem defines */
 
 struct xfs_buf;
@@ -77,6 +73,9 @@ struct xfs_item_ops {
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
 };
 
+void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
+			  int type, const struct xfs_item_ops *ops);
+
 /*
  * Return values for the iop_push() routines.
  */
@@ -85,6 +84,7 @@ struct xfs_item_ops {
 #define XFS_ITEM_LOCKED		2
 #define XFS_ITEM_FLUSHING	3
 
+
 /*
  * This is the structure maintained for every active transaction.
  */
@@ -125,7 +125,6 @@ typedef struct xfs_trans {
 	int64_t			t_rextents_delta;/* superblocks rextents chg */
 	int64_t			t_rextslog_delta;/* superblocks rextslog chg */
 	struct list_head	t_items;	/* log item descriptors */
-	xfs_trans_header_t	t_header;	/* header for in-log trans */
 	struct list_head	t_busy;		/* list of busy extents */
 	unsigned long		t_pflags;	/* saved process flags state */
 } xfs_trans_t;
@@ -230,6 +229,7 @@ void		xfs_trans_log_efd_extent(xfs_trans_t *,
 					 xfs_fsblock_t,
 					 xfs_extlen_t);
 int		xfs_trans_commit(xfs_trans_t *, uint flags);
+int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 void		xfs_trans_cancel(xfs_trans_t *, int);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 21c6d7ddbc06..4b47cfebd25b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -18,15 +18,16 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_trace.h"
 #include "xfs_error.h"
+#include "xfs_log.h"
 
 #ifdef DEBUG
 /*
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 0167a65ec851..6b1b993257d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -18,8 +18,8 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -28,6 +28,7 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d630d2c2804..2025f6eea4aa 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,10 +17,10 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_alloc.h"
@@ -33,6 +33,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_attr.h"
+#include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 3bfb98bec050..47978ba89dae 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,11 +18,12 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
 
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index dcab52ec9cb6..b1a0efa6b32a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -18,8 +18,8 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -29,6 +29,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index c52def0b441c..12e86af9d9b9 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -27,7 +27,6 @@ struct xfs_log_vec;
 
 
 void	xfs_trans_init(struct xfs_mount *);
-int	xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void	xfs_trans_del_item(struct xfs_log_item *);
 void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index cc8361255c8e..89de046331da 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -18,11 +18,10 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_format.h"
 #include "xfs_shared.h"
-#include "xfs_log.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -41,6 +40,7 @@
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_quota.h"
+#include "xfs_trans.h"
 #include "xfs_qm.h"
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 08047502683d..23052992c7e2 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -18,8 +18,7 @@
 
 #include "xfs.h"
 #include "xfs_log_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-- 
cgit v1.3


From a4fbe6ab1e7abecf42b75e9c73701ed33b4ab03b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 23 Oct 2013 10:51:50 +1100
Subject: xfs: decouple inode and bmap btree header files

Currently the xfs_inode.h header has a dependency on the definition
of the BMAP btree records as the inode fork includes an array of
xfs_bmbt_rec_host_t objects in it's definition.

Move all the btree format definitions from xfs_btree.h,
xfs_bmap_btree.h, xfs_alloc_btree.h and xfs_ialloc_btree.h to
xfs_format.h to continue the process of centralising the on-disk
format definitions. With this done, the xfs inode definitions are no
longer dependent on btree header files.

The enables a massive culling of unnecessary includes, with close to
200 #include directives removed from the XFS kernel code base.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_acl.c           |   8 +-
 fs/xfs/xfs_alloc.c         |   5 +-
 fs/xfs/xfs_alloc_btree.c   |   7 +-
 fs/xfs/xfs_alloc_btree.h   |  33 ------
 fs/xfs/xfs_aops.c          |   4 +-
 fs/xfs/xfs_attr.c          |   4 +-
 fs/xfs/xfs_attr_inactive.c |   8 +-
 fs/xfs/xfs_attr_leaf.c     |  13 +--
 fs/xfs/xfs_attr_list.c     |  12 +--
 fs/xfs/xfs_attr_remote.c   |   5 +-
 fs/xfs/xfs_bmap.c          |  12 +--
 fs/xfs/xfs_bmap_btree.c    |   7 +-
 fs/xfs/xfs_bmap_btree.h    | 103 ------------------
 fs/xfs/xfs_bmap_util.c     |   6 +-
 fs/xfs/xfs_btree.c         |   5 +-
 fs/xfs/xfs_btree.h         |  79 +++-----------
 fs/xfs/xfs_buf_item.h      |   4 -
 fs/xfs/xfs_da_btree.c      |   2 -
 fs/xfs/xfs_dir2.c          |   5 +-
 fs/xfs/xfs_dir2_block.c    |   4 +-
 fs/xfs/xfs_dir2_data.c     |   3 +-
 fs/xfs/xfs_dir2_leaf.c     |   4 +-
 fs/xfs/xfs_dir2_node.c     |   3 +-
 fs/xfs/xfs_dir2_readdir.c  |   4 +-
 fs/xfs/xfs_dir2_sf.c       |   4 +-
 fs/xfs/xfs_discard.c       |   6 +-
 fs/xfs/xfs_dquot.c         |   5 +-
 fs/xfs/xfs_dquot_buf.c     |   2 -
 fs/xfs/xfs_dquot_item.c    |   8 +-
 fs/xfs/xfs_error.c         |   4 +-
 fs/xfs/xfs_export.c        |   2 +-
 fs/xfs/xfs_extent_busy.c   |   2 -
 fs/xfs/xfs_extent_busy.h   |   4 +
 fs/xfs/xfs_file.c          |   5 +-
 fs/xfs/xfs_filestream.c    |   4 +-
 fs/xfs/xfs_format.h        | 255 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_fsops.c         |  12 +--
 fs/xfs/xfs_ialloc.c        |   6 +-
 fs/xfs/xfs_ialloc.h        |   3 +-
 fs/xfs/xfs_ialloc_btree.c  |   6 +-
 fs/xfs/xfs_ialloc_btree.h  |  49 ---------
 fs/xfs/xfs_icache.c        |   4 -
 fs/xfs/xfs_inode.c         |  10 +-
 fs/xfs/xfs_inode.h         |   1 -
 fs/xfs/xfs_inode_buf.c     |   6 +-
 fs/xfs/xfs_inode_fork.c    |  16 +--
 fs/xfs/xfs_inode_fork.h    |   1 +
 fs/xfs/xfs_inode_item.c    |   4 +-
 fs/xfs/xfs_ioctl.c         |   9 +-
 fs/xfs/xfs_ioctl32.c       |   3 +-
 fs/xfs/xfs_iomap.c         |  16 +--
 fs/xfs/xfs_iops.c          |   8 +-
 fs/xfs/xfs_itable.c        |   9 +-
 fs/xfs/xfs_log.c           |   6 +-
 fs/xfs/xfs_log_recover.c   |  18 ++--
 fs/xfs/xfs_log_rlimit.c    |   3 +-
 fs/xfs/xfs_mount.c         |  15 +--
 fs/xfs/xfs_qm.c            |  11 +-
 fs/xfs/xfs_qm_bhv.c        |   7 --
 fs/xfs/xfs_qm_syscalls.c   |  10 +-
 fs/xfs/xfs_quotaops.c      |   3 +-
 fs/xfs/xfs_rtalloc.c       |   9 +-
 fs/xfs/xfs_sb.c            |  18 +---
 fs/xfs/xfs_super.c         |  16 +--
 fs/xfs/xfs_symlink.c       |   6 +-
 fs/xfs/xfs_trace.c         |   5 +-
 fs/xfs/xfs_trans.c         |  16 +--
 fs/xfs/xfs_trans.h         |   5 +
 fs/xfs/xfs_trans_buf.c     |   5 +-
 fs/xfs/xfs_trans_dquot.c   |   9 +-
 fs/xfs/xfs_trans_inode.c   |   6 +-
 fs/xfs/xfs_trans_resv.c    |  12 +--
 fs/xfs/xfs_xattr.c         |   2 +-
 73 files changed, 403 insertions(+), 583 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 0e2f37efedd0..370eb3e121d1 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -16,15 +16,15 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
 #include "xfs_ag.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
 #include "xfs_trace.h"
 #include <linux/slab.h>
 #include <linux/xattr.h>
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 999db7a95663..bcf16528bac5 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -25,12 +25,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 0bd33bfbb789..698587f6c60a 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -17,17 +17,14 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 72676c36b1bb..45e189e7e81c 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -26,39 +26,6 @@ struct xfs_buf;
 struct xfs_btree_cur;
 struct xfs_mount;
 
-/*
- * There are two on-disk btrees, one sorted by blockno and one sorted
- * by blockcount and blockno.  All blocks look the same to make the code
- * simpler; if we have time later, we'll make the optimizations.
- */
-#define	XFS_ABTB_MAGIC		0x41425442	/* 'ABTB' for bno tree */
-#define	XFS_ABTB_CRC_MAGIC	0x41423342	/* 'AB3B' */
-#define	XFS_ABTC_MAGIC		0x41425443	/* 'ABTC' for cnt tree */
-#define	XFS_ABTC_CRC_MAGIC	0x41423343	/* 'AB3C' */
-
-/*
- * Data record/key structure
- */
-typedef struct xfs_alloc_rec {
-	__be32		ar_startblock;	/* starting block number */
-	__be32		ar_blockcount;	/* count of free blocks */
-} xfs_alloc_rec_t, xfs_alloc_key_t;
-
-typedef struct xfs_alloc_rec_incore {
-	xfs_agblock_t	ar_startblock;	/* starting block number */
-	xfs_extlen_t	ar_blockcount;	/* count of free blocks */
-} xfs_alloc_rec_incore_t;
-
-/* btree pointer type */
-typedef __be32 xfs_alloc_ptr_t;
-
-/*
- * Block numbers in the AG:
- * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
- */
-#define	XFS_BNO_BLOCK(mp)	((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
-#define	XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
-
 /*
  * Btree block header size depends on a superblock flag.
  */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index caa63cb3e2fd..eb310caf13b1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -23,8 +23,6 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
@@ -34,6 +32,8 @@
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
 #include <linux/aio.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index ce6f9e0b6c5f..b86127072ac3 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -27,15 +27,14 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_remote.h"
@@ -43,6 +42,7 @@
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
+#include "xfs_dinode.h"
 
 /*
  * xfs_attr.c
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 45480f5b6aea..f33fb62b7f17 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -28,14 +28,9 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
+#include "xfs_inode.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_attr_remote.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
@@ -44,6 +39,7 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
+#include "xfs_dinode.h"
 
 /*
  * Look at all the extents for this logical region,
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c10c70c88ec9..a0f90193a247 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
@@ -26,24 +27,20 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_dinode.h"
 
 
 /*
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a319eb4d123f..46c4ce148a43 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
@@ -26,24 +27,19 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_dinode.h"
 
 STATIC int
 xfs_attr_shortform_compare(const void *a, const void *b)
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 1407191f90c3..2e5530467f2d 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
@@ -25,10 +26,7 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_error.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_trans.h"
@@ -42,6 +40,7 @@
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_buf_item.h"
+#include "xfs_error.h"
 
 #define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c3756ac09658..1c02da8bb7df 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -29,29 +29,25 @@
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_mount.h"
-#include "xfs_itable.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_attr_leaf.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
 #include "xfs_buf_item.h"
-#include "xfs_filestream.h"
 #include "xfs_trace.h"
 #include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
 
 
 kmem_zone_t		*xfs_bmap_free_item_zone;
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 89ef4e67ff54..2fb4a2202e17 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,21 +24,18 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_itable.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_dinode.h"
 
 /*
  * Determine the extent state.
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index e2b0768a89ba..6e42e1e50b89 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -18,94 +18,12 @@
 #ifndef __XFS_BMAP_BTREE_H__
 #define __XFS_BMAP_BTREE_H__
 
-#define XFS_BMAP_MAGIC		0x424d4150	/* 'BMAP' */
-#define XFS_BMAP_CRC_MAGIC	0x424d4133	/* 'BMA3' */
-
 struct xfs_btree_cur;
 struct xfs_btree_block;
 struct xfs_mount;
 struct xfs_inode;
 struct xfs_trans;
 
-/*
- * Bmap root header, on-disk form only.
- */
-typedef struct xfs_bmdr_block {
-	__be16		bb_level;	/* 0 is a leaf */
-	__be16		bb_numrecs;	/* current # of data records */
-} xfs_bmdr_block_t;
-
-/*
- * Bmap btree record and extent descriptor.
- *  l0:63 is an extent flag (value 1 indicates non-normal).
- *  l0:9-62 are startoff.
- *  l0:0-8 and l1:21-63 are startblock.
- *  l1:0-20 are blockcount.
- */
-#define BMBT_EXNTFLAG_BITLEN	1
-#define BMBT_STARTOFF_BITLEN	54
-#define BMBT_STARTBLOCK_BITLEN	52
-#define BMBT_BLOCKCOUNT_BITLEN	21
-
-typedef struct xfs_bmbt_rec {
-	__be64			l0, l1;
-} xfs_bmbt_rec_t;
-
-typedef __uint64_t	xfs_bmbt_rec_base_t;	/* use this for casts */
-typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
-
-typedef struct xfs_bmbt_rec_host {
-	__uint64_t		l0, l1;
-} xfs_bmbt_rec_host_t;
-
-/*
- * Values and macros for delayed-allocation startblock fields.
- */
-#define STARTBLOCKVALBITS	17
-#define STARTBLOCKMASKBITS	(15 + XFS_BIG_BLKNOS * 20)
-#define DSTARTBLOCKMASKBITS	(15 + 20)
-#define STARTBLOCKMASK		\
-	(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define DSTARTBLOCKMASK		\
-	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-
-static inline int isnullstartblock(xfs_fsblock_t x)
-{
-	return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
-}
-
-static inline int isnulldstartblock(xfs_dfsbno_t x)
-{
-	return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
-}
-
-static inline xfs_fsblock_t nullstartblock(int k)
-{
-	ASSERT(k < (1 << STARTBLOCKVALBITS));
-	return STARTBLOCKMASK | (k);
-}
-
-static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
-{
-	return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
-}
-
-/*
- * Possible extent formats.
- */
-typedef enum {
-	XFS_EXTFMT_NOSTATE = 0,
-	XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
- * Possible extent states.
- */
-typedef enum {
-	XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
-	XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
-} xfs_exntst_t;
-
 /*
  * Extent state and extent format macros.
  */
@@ -114,27 +32,6 @@ typedef enum {
 		XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
 #define ISUNWRITTEN(x)	((x)->br_state == XFS_EXT_UNWRITTEN)
 
-/*
- * Incore version of above.
- */
-typedef struct xfs_bmbt_irec
-{
-	xfs_fileoff_t	br_startoff;	/* starting file offset */
-	xfs_fsblock_t	br_startblock;	/* starting block number */
-	xfs_filblks_t	br_blockcount;	/* number of blocks */
-	xfs_exntst_t	br_state;	/* extent state */
-} xfs_bmbt_irec_t;
-
-/*
- * Key structure for non-leaf levels of the tree.
- */
-typedef struct xfs_bmbt_key {
-	__be64		br_startoff;	/* starting file offset */
-} xfs_bmbt_key_t, xfs_bmdr_key_t;
-
-/* btree pointer type */
-typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
-
 /*
  * Btree block header size depends on a superblock flag.
  */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 271c0d6ceafd..5887e41c0323 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -27,10 +27,6 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_trans.h"
@@ -38,6 +34,7 @@
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
@@ -45,6 +42,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
+#include "xfs_dinode.h"
 
 /* Kernel only BMAP related definitions and functions */
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 08365117a933..9adaae4f3e2f 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -18,16 +18,13 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 06729b67ad58..91e34f21bace 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -26,73 +26,6 @@ struct xfs_trans;
 
 extern kmem_zone_t	*xfs_btree_cur_zone;
 
-/*
- * This nonsense is to make -wlint happy.
- */
-#define	XFS_LOOKUP_EQ	((xfs_lookup_t)XFS_LOOKUP_EQi)
-#define	XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
-#define	XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)
-
-#define	XFS_BTNUM_BNO	((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define	XFS_BTNUM_CNT	((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define	XFS_BTNUM_BMAP	((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define	XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
-
-/*
- * Generic btree header.
- *
- * This is a combination of the actual format used on disk for short and long
- * format btrees.  The first three fields are shared by both format, but the
- * pointers are different and should be used with care.
- *
- * To get the size of the actual short or long form headers please use the size
- * macros below.  Never use sizeof(xfs_btree_block).
- *
- * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
- * with the crc feature bit, and all accesses to them must be conditional on
- * that flag.
- */
-struct xfs_btree_block {
-	__be32		bb_magic;	/* magic number for block type */
-	__be16		bb_level;	/* 0 is a leaf */
-	__be16		bb_numrecs;	/* current # of data records */
-	union {
-		struct {
-			__be32		bb_leftsib;
-			__be32		bb_rightsib;
-
-			__be64		bb_blkno;
-			__be64		bb_lsn;
-			uuid_t		bb_uuid;
-			__be32		bb_owner;
-			__le32		bb_crc;
-		} s;			/* short form pointers */
-		struct	{
-			__be64		bb_leftsib;
-			__be64		bb_rightsib;
-
-			__be64		bb_blkno;
-			__be64		bb_lsn;
-			uuid_t		bb_uuid;
-			__be64		bb_owner;
-			__le32		bb_crc;
-			__be32		bb_pad; /* padding for alignment */
-		} l;			/* long form pointers */
-	} bb_u;				/* rest */
-};
-
-#define XFS_BTREE_SBLOCK_LEN	16	/* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN	24	/* size of a long form block */
-
-/* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN	(XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN	(XFS_BTREE_LBLOCK_LEN + 48)
-
-#define XFS_BTREE_SBLOCK_CRC_OFF \
-	offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
-#define XFS_BTREE_LBLOCK_CRC_OFF \
-	offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
-
 /*
  * Generic key, ptr and record wrapper structures.
  *
@@ -118,6 +51,18 @@ union xfs_btree_rec {
 	xfs_inobt_rec_t		inobt;
 };
 
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define	XFS_LOOKUP_EQ	((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define	XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define	XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define	XFS_BTNUM_BNO	((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define	XFS_BTNUM_CNT	((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define	XFS_BTNUM_BMAP	((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define	XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
+
 /*
  * For logging record fields.
  */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index db6371087fe8..3f3455a41510 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -71,10 +71,6 @@ void	xfs_buf_attach_iodone(struct xfs_buf *,
 void	xfs_buf_iodone_callbacks(struct xfs_buf *);
 void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
 
-void	xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
-			       enum xfs_blft);
-void	xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
-
 extern kmem_zone_t	*xfs_buf_item_zone;
 
 #endif	/* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 377011195975..eb65c546ffd8 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -28,10 +28,8 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index cef16615f761..38bf9324302c 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_inum.h"
@@ -25,9 +26,6 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
@@ -36,6 +34,7 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_dinode.h"
 
 struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
 
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 6c7cd3ce9e72..9f3f83a5e2da 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
@@ -25,8 +26,6 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
@@ -37,6 +36,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_dinode.h"
 
 /*
  * Local function prototypes.
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 3f7edddb8eff..ccfeb4d8376a 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
@@ -25,8 +26,6 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 44fe4b83ac9b..51fdc11a1e2c 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -18,16 +18,14 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2.h"
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 7836f048102e..b8381646b8af 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
@@ -25,8 +26,6 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2.h"
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 048d582525cf..45c9ce8cdb28 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
@@ -26,8 +27,6 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
@@ -35,6 +34,7 @@
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
 #include "xfs_trans.h"
+#include "xfs_dinode.h"
 
 /*
  * Directory file type support functions
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index ac3bb58b58b9..8811ee5eaec6 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
@@ -24,8 +25,6 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
@@ -33,6 +32,7 @@
 #include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_trace.h"
+#include "xfs_dinode.h"
 
 /*
  * Prototypes for internal functions.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d38f34cecb40..8367d6dc18c9 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -23,11 +23,9 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
 #include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 47a815d4a345..6b1e695caf0e 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -25,16 +25,12 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_alloc.h"
 #include "xfs_quota.h"
-#include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -43,6 +39,7 @@
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_bmap_btree.h"
 
 /*
  * Lock order:
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index 3e84b3a57f57..aaaf41b29869 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -21,11 +21,9 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
-#include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_quota.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index f8a47f77c3fa..92e5f62eefc6 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -22,16 +22,10 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_quota.h"
 #include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index d6c188cc7d43..9995b807d627 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -16,15 +16,13 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_format.h"
 #include "xfs_fs.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
 #include "xfs_error.h"
 
 #ifdef DEBUG
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8502aee8e65d..1399e187d425 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -16,6 +16,7 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
@@ -24,7 +25,6 @@
 #include "xfs_da_format.h"
 #include "xfs_dir2.h"
 #include "xfs_export.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index de462640606f..fd22f69049d4 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -26,9 +26,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_alloc.h"
-#include "xfs_inode.h"
 #include "xfs_extent_busy.h"
 #include "xfs_trace.h"
 #include "xfs_trans.h"
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 985412d65ba5..bfff284d2dcc 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -20,6 +20,10 @@
 #ifndef __XFS_EXTENT_BUSY_H__
 #define	__XFS_EXTENT_BUSY_H__
 
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_alloc_arg;
+
 /*
  * Busy block/extent entry.  Indexed by a rbtree in perag to mark blocks that
  * have been freed but whose transactions aren't committed to disk yet.
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8b43bfa023af..52c91e143725 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
@@ -25,9 +26,6 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
@@ -39,6 +37,7 @@
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_dinode.h"
 
 #include <linux/aio.h>
 #include <linux/dcache.h>
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 7f2c3c1c8ba4..12b6e7701985 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -16,19 +16,19 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_ag.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_alloc.h"
 #include "xfs_mru_cache.h"
+#include "xfs_dinode.h"
 #include "xfs_filestream.h"
 #include "xfs_trace.h"
 
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index a790428f3bf4..b6ab5a3cfa12 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -156,4 +156,259 @@ struct xfs_dsymlink_hdr {
 	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
 			sizeof(struct xfs_dsymlink_hdr) : 0))
 
+
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno.  All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define	XFS_ABTB_MAGIC		0x41425442	/* 'ABTB' for bno tree */
+#define	XFS_ABTB_CRC_MAGIC	0x41423342	/* 'AB3B' */
+#define	XFS_ABTC_MAGIC		0x41425443	/* 'ABTC' for cnt tree */
+#define	XFS_ABTC_CRC_MAGIC	0x41423343	/* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+	__be32		ar_startblock;	/* starting block number */
+	__be32		ar_blockcount;	/* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+	xfs_agblock_t	ar_startblock;	/* starting block number */
+	xfs_extlen_t	ar_blockcount;	/* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define	XFS_BNO_BLOCK(mp)	((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define	XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define	XFS_IBT_MAGIC		0x49414254	/* 'IABT' */
+#define	XFS_IBT_CRC_MAGIC	0x49414233	/* 'IAB3' */
+
+typedef	__uint64_t	xfs_inofree_t;
+#define	XFS_INODES_PER_CHUNK		(NBBY * sizeof(xfs_inofree_t))
+#define	XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
+#define	XFS_INOBT_ALL_FREE		((xfs_inofree_t)-1)
+#define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec {
+	__be32		ir_startino;	/* starting inode number */
+	__be32		ir_freecount;	/* count of free inodes (set bits) */
+	__be64		ir_free;	/* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+	xfs_agino_t	ir_startino;	/* starting inode number */
+	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
+	xfs_inofree_t	ir_free;	/* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+	__be32		ir_startino;	/* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define	XFS_IBT_BLOCK(mp)		((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define	XFS_PREALLOC_BLOCKS(mp)		((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC		0x424d4150	/* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC	0x424d4133	/* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ *  l0:63 is an extent flag (value 1 indicates non-normal).
+ *  l0:9-62 are startoff.
+ *  l0:0-8 and l1:21-63 are startblock.
+ *  l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN	1
+#define BMBT_STARTOFF_BITLEN	54
+#define BMBT_STARTBLOCK_BITLEN	52
+#define BMBT_BLOCKCOUNT_BITLEN	21
+
+typedef struct xfs_bmbt_rec {
+	__be64			l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef __uint64_t	xfs_bmbt_rec_base_t;	/* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+typedef struct xfs_bmbt_rec_host {
+	__uint64_t		l0, l1;
+} xfs_bmbt_rec_host_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS	17
+#define STARTBLOCKMASKBITS	(15 + XFS_BIG_BLKNOS * 20)
+#define DSTARTBLOCKMASKBITS	(15 + 20)
+#define STARTBLOCKMASK		\
+	(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK		\
+	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+	return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline int isnulldstartblock(xfs_dfsbno_t x)
+{
+	return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+	ASSERT(k < (1 << STARTBLOCKVALBITS));
+	return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+	return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+	XFS_EXTFMT_NOSTATE = 0,
+	XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+	XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+	XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+	xfs_fileoff_t	br_startoff;	/* starting file offset */
+	xfs_fsblock_t	br_startblock;	/* starting block number */
+	xfs_filblks_t	br_blockcount;	/* number of blocks */
+	xfs_exntst_t	br_state;	/* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+	__be64		br_startoff;	/* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees.  The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below.  Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+struct xfs_btree_block {
+	__be32		bb_magic;	/* magic number for block type */
+	__be16		bb_level;	/* 0 is a leaf */
+	__be16		bb_numrecs;	/* current # of data records */
+	union {
+		struct {
+			__be32		bb_leftsib;
+			__be32		bb_rightsib;
+
+			__be64		bb_blkno;
+			__be64		bb_lsn;
+			uuid_t		bb_uuid;
+			__be32		bb_owner;
+			__le32		bb_crc;
+		} s;			/* short form pointers */
+		struct	{
+			__be64		bb_leftsib;
+			__be64		bb_rightsib;
+
+			__be64		bb_blkno;
+			__be64		bb_lsn;
+			uuid_t		bb_uuid;
+			__be64		bb_owner;
+			__le32		bb_crc;
+			__be32		bb_pad; /* padding for alignment */
+		} l;			/* long form pointers */
+	} bb_u;				/* rest */
+};
+
+#define XFS_BTREE_SBLOCK_LEN	16	/* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN	24	/* size of a long form block */
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN	(XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN	(XFS_BTREE_LBLOCK_LEN + 48)
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+	offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+	offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
 #endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ceaec3165921..a6e54b3319bd 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -18,30 +18,28 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
-#include "xfs_log_format.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
-#include "xfs_btree.h"
 #include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_fsops.h"
 #include "xfs_itable.h"
 #include "xfs_trans_space.h"
 #include "xfs_rtalloc.h"
-#include "xfs_filestream.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
 
 /*
  * File system operations
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c973ff8c4a7c..14d732f61a41 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -26,13 +26,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
@@ -42,6 +39,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_icreate_item.h"
 #include "xfs_icache.h"
+#include "xfs_dinode.h"
 
 
 /*
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 155779803d67..a8f76a5ff418 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -23,6 +23,7 @@ struct xfs_dinode;
 struct xfs_imap;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_btree_cur;
 
 /*
  * Allocation parameters for inode allocation.
@@ -42,7 +43,7 @@ struct xfs_trans;
 static inline struct xfs_dinode *
 xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 {
-	return (xfs_dinode_t *)
+	return (struct xfs_dinode *)
 		(xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
 }
 
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2a0b6158e4c4..1fa142dc86cb 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -17,19 +17,17 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index cfbfe461abbc..f38b22011c4e 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -26,55 +26,6 @@ struct xfs_buf;
 struct xfs_btree_cur;
 struct xfs_mount;
 
-/*
- * There is a btree for the inode map per allocation group.
- */
-#define	XFS_IBT_MAGIC		0x49414254	/* 'IABT' */
-#define	XFS_IBT_CRC_MAGIC	0x49414233	/* 'IAB3' */
-
-typedef	__uint64_t	xfs_inofree_t;
-#define	XFS_INODES_PER_CHUNK		(NBBY * sizeof(xfs_inofree_t))
-#define	XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
-#define	XFS_INOBT_ALL_FREE		((xfs_inofree_t)-1)
-#define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
-
-static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
-{
-	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
-}
-
-/*
- * Data record structure
- */
-typedef struct xfs_inobt_rec {
-	__be32		ir_startino;	/* starting inode number */
-	__be32		ir_freecount;	/* count of free inodes (set bits) */
-	__be64		ir_free;	/* free inode mask */
-} xfs_inobt_rec_t;
-
-typedef struct xfs_inobt_rec_incore {
-	xfs_agino_t	ir_startino;	/* starting inode number */
-	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
-	xfs_inofree_t	ir_free;	/* free inode mask */
-} xfs_inobt_rec_incore_t;
-
-
-/*
- * Key structure
- */
-typedef struct xfs_inobt_key {
-	__be32		ir_startino;	/* starting inode number */
-} xfs_inobt_key_t;
-
-/* btree pointer type */
-typedef __be32 xfs_inobt_ptr_t;
-
-/*
- * block numbers in the AG.
- */
-#define	XFS_IBT_BLOCK(mp)		((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
-#define	XFS_PREALLOC_BLOCKS(mp)		((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
-
 /*
  * Btree block header size depends on a superblock flag.
  */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 6db11556731d..98d35244eecc 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -24,17 +24,13 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "xfs_dinode.h"
 #include "xfs_error.h"
-#include "xfs_filestream.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
-#include "xfs_fsops.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_util.h"
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f0440dcb5d8c..326b94dbe159 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -27,27 +27,22 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_inode.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_attr_sf.h"
 #include "xfs_attr.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
 #include "xfs_trans_space.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
+#include "xfs_dinode.h"
 #include "xfs_filestream.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
@@ -55,6 +50,7 @@
 #include "xfs_symlink.h"
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
+#include "xfs_bmap_btree.h"
 
 kmem_zone_t *xfs_inode_zone;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index cce62ce1a73a..66675877f38c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -24,7 +24,6 @@
 /*
  * Kernel only inode definitions
  */
-
 struct xfs_dinode;
 struct xfs_inode;
 struct xfs_buf;
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index c0f974b6cd46..4fc9f39dd89e 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -24,15 +24,13 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_error.h"
 #include "xfs_cksum.h"
 #include "xfs_icache.h"
-#include "xfs_ialloc.h"
 #include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_dinode.h"
 
 /*
  * Check that none of the inode's in the buffer have a next
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index b89b58a3a350..22c9837c5d4b 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -26,25 +26,15 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_filestream.h"
-#include "xfs_cksum.h"
 #include "xfs_trace.h"
-#include "xfs_icache.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
 
 kmem_zone_t *xfs_ifork_zone;
 
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
index 28661a0d9058..eb329a1ea888 100644
--- a/fs/xfs/xfs_inode_fork.h
+++ b/fs/xfs/xfs_inode_fork.h
@@ -19,6 +19,7 @@
 #define	__XFS_INODE_FORK_H__
 
 struct xfs_inode_log_item;
+struct xfs_dinode;
 
 /*
  * The following xfs_ext_irec_t struct introduces a second (top) level
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9f8e85ef2681..7c0d391f9a6e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,19 +17,19 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_trans_priv.h"
+#include "xfs_dinode.h"
 
 
 kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea65a73d49ba..4d613401a5e0 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -23,28 +23,25 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ioctl.h"
+#include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_fsops.h"
 #include "xfs_discard.h"
 #include "xfs_quota.h"
-#include "xfs_inode_item.h"
 #include "xfs_export.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
+#include "xfs_dinode.h"
+#include "xfs_trans.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1be3333b260c..e8fb1231db81 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -22,14 +22,13 @@
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_vnode.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index adadf80709b8..22d1cbea283d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -24,29 +24,21 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
-#include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
 #include "xfs_trans_space.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_quota.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
+#include "xfs_dinode.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 3f2ae19c8ca0..718b62b0fe05 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -25,26 +25,20 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_acl.h"
-#include "xfs_alloc.h"
 #include "xfs_quota.h"
-#include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_itable.h"
 #include "xfs_attr.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
-#include "xfs_inode_item.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2_priv.h"
+#include "xfs_dinode.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 964b1caae085..c237ad15d500 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -18,23 +18,22 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_btree.h"
 #include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
-#include "xfs_btree.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_dinode.h"
 
 STATIC int
 xfs_internal_inum(
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5275ad886971..e523396753c5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
@@ -28,12 +29,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trace.h"
 #include "xfs_fsops.h"
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 832ba5d5eb3a..b6b669df40f3 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -27,29 +27,23 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_error.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_trans_priv.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
 #include "xfs_quota.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
-
-/* Need all the magic numbers and buffer ops structures from these headers */
-#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
 #include "xfs_dir2.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
index 3749bc1c00a4..2af1a0a4d0f1 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_ag.h"
@@ -25,10 +26,10 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_trans_space.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_da_btree.h"
 #include "xfs_attr_leaf.h"
+#include "xfs_bmap_btree.h"
 
 /*
  * Calculate the maximum length in bytes that would be required for a local
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 83f7810c53cd..da88f167af78 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -27,27 +27,20 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_log.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
+#include "xfs_dir2.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
 
 
 #ifdef HAVE_PERCPU_SB
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index e2f675644b4d..14a4996cfec6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -17,33 +17,28 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
-#include "xfs_shared.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
-#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
 #include "xfs_quota.h"
 #include "xfs_error.h"
 #include "xfs_bmap.h"
-#include "xfs_attr.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_cksum.h"
+#include "xfs_dinode.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 704628961159..e9be63abd8d2 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -22,18 +22,11 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_attr.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_qm.h"
 
 
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index ab2326aaaeda..437c9198031a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -27,19 +27,11 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_quota.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 2dd467b0b475..af33cafe69b6 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -22,10 +22,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_inode.h"
 #include "xfs_quota.h"
 #include "xfs_trans.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
 #include "xfs_qm.h"
 #include <linux/quota.h>
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 23c5ca24a02d..d5da15578185 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,21 +25,18 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
-#include "xfs_rtalloc.h"
-#include "xfs_fsops.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
 #include "xfs_buf.h"
 #include "xfs_icache.h"
+#include "xfs_dinode.h"
 
 
 /*
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 1b0a6420f1b8..05b5493d2baa 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -21,31 +21,21 @@
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_bmap.h"
 #include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_fsops.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_trans.h"
-#include "xfs_trans_priv.h"
 #include "xfs_buf_item.h"
+#include "xfs_dinode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4e20b420ff72..4eb63ad87d7d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -24,29 +24,18 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_alloc.h"
 #include "xfs_error.h"
-#include "xfs_itable.h"
 #include "xfs_fsops.h"
-#include "xfs_attr.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
-#include "xfs_trans_priv.h"
-#include "xfs_filestream.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_extfree_item.h"
@@ -55,6 +44,9 @@
 #include "xfs_icache.h"
 #include "xfs_trace.h"
 #include "xfs_icreate_item.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
+#include "xfs_quota.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 93fc72c96fad..14e58f2c96bd 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -28,13 +28,11 @@
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
 #include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_bmap_util.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
@@ -42,8 +40,8 @@
 #include "xfs_trace.h"
 #include "xfs_symlink.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_log.h"
+#include "xfs_dinode.h"
 
 /* ----- Kernel only functions below ----- */
 STATIC int
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 889bb4233323..dee3279c095e 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -25,10 +25,6 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_da_btree.h"
@@ -49,6 +45,7 @@
 #include "xfs_dquot.h"
 #include "xfs_log_recover.h"
 #include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8d1595f9fd50..c812c5c060de 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -25,28 +25,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_da_format.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
 #include "xfs_quota.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-#include "xfs_trans_space.h"
-#include "xfs_qm.h"
-#include "xfs_inode_item.h"
 #include "xfs_log.h"
-#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
 #include "xfs_trace.h"
+#include "xfs_error.h"
 
 kmem_zone_t	*xfs_trans_zone;
 kmem_zone_t	*xfs_log_item_desc_zone;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9cc5fbed4f26..9b96d35e483d 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -234,6 +234,11 @@ void		xfs_trans_cancel(xfs_trans_t *, int);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
 
+void		xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+				       enum xfs_blft);
+void		xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
+					struct xfs_buf *src_bp);
+
 extern kmem_zone_t	*xfs_trans_zone;
 extern kmem_zone_t	*xfs_log_item_desc_zone;
 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6b1b993257d6..c035d11b7734 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -18,15 +18,12 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 2025f6eea4aa..cd2a10e15d3a 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -23,19 +23,12 @@
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_attr.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
+#include "xfs_quota.h"
 #include "xfs_qm.h"
 
 STATIC void	xfs_trans_alloc_dqinfo(xfs_trans_t *);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index b1a0efa6b32a..1bba7f60d94c 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -18,17 +18,13 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 89de046331da..d53d9f0627a7 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -26,19 +26,9 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_error.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
 #include "xfs_ialloc.h"
-#include "xfs_alloc.h"
-#include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
 #include "xfs_quota.h"
 #include "xfs_trans.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 23052992c7e2..9d479073ba41 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,13 +17,13 @@
  */
 
 #include "xfs.h"
+#include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_da_format.h"
-#include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
-- 
cgit v1.3


From c963c6193a5adaec58044e238ef23516d04e5a74 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 15 Oct 2013 09:17:56 +1100
Subject: xfs: split xfs_rtalloc.c for userspace sanity

xfs_rtalloc.c is partially shared with userspace. Split the file up
into two parts - one that is kernel private and the other which is
wholly shared with userspace.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/Makefile       |    6 +-
 fs/xfs/xfs_rtalloc.c  | 1845 ++++++++++++-------------------------------------
 fs/xfs/xfs_rtalloc.h  |   24 +
 fs/xfs/xfs_rtbitmap.c |  973 ++++++++++++++++++++++++++
 4 files changed, 1446 insertions(+), 1402 deletions(-)
 create mode 100644 fs/xfs/xfs_rtbitmap.c

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 08f11bbf56ae..33a69fabfd83 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -104,7 +104,11 @@ xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
 				   xfs_qm_bhv.o \
 				   xfs_qm.o \
 				   xfs_quotaops.o
-xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
+
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
+				   xfs_rtbitmap.o
+
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d5da15578185..a6a76b2b6a85 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -37,247 +37,336 @@
 #include "xfs_buf.h"
 #include "xfs_icache.h"
 #include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
 
 
 /*
- * Prototypes for internal functions.
- */
-
-
-STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
-		xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
-STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
-		xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
-STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
-		xfs_extlen_t, int, xfs_rtblock_t *, int *);
-STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
-		xfs_rtblock_t, xfs_rtblock_t *);
-STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
-		xfs_rtblock_t, xfs_rtblock_t *);
-STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
-		xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
-STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
-		xfs_extlen_t, int);
-STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
-		xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
-
-/*
- * Internal functions.
- */
-
-/*
- * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
  */
 STATIC int				/* error */
-xfs_growfs_rt_alloc(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_extlen_t	oblocks,	/* old count of blocks */
-	xfs_extlen_t	nblocks,	/* new count of blocks */
-	xfs_inode_t	*ip)		/* inode (bitmap/summary) */
+xfs_rtget_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
 {
-	xfs_fileoff_t	bno;		/* block number in file */
-	xfs_buf_t	*bp;		/* temporary buffer for zeroing */
-	int		committed;	/* transaction committed flag */
-	xfs_daddr_t	d;		/* disk block address */
-	int		error;		/* error return value */
-	xfs_fsblock_t	firstblock;	/* first block allocated in xaction */
-	xfs_bmap_free_t	flist;		/* list of freed blocks */
-	xfs_fsblock_t	fsbno;		/* filesystem block for bno */
-	xfs_bmbt_irec_t	map;		/* block map output */
-	int		nmap;		/* number of block maps */
-	int		resblks;	/* space reservation */
+	xfs_buf_t	*bp;		/* buffer for summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
 
 	/*
-	 * Allocate space to the file, as necessary.
+	 * Compute entry number in the summary file.
 	 */
-	while (oblocks < nblocks) {
-		int		cancelflags = 0;
-		xfs_trans_t	*tp;
-
-		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
-		resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
-		/*
-		 * Reserve space & log for one extent added to the file.
-		 */
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
-					  resblks, 0);
-		if (error)
-			goto error_cancel;
-		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
-		/*
-		 * Lock the inode.
-		 */
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-		xfs_bmap_init(&flist, &firstblock);
-		/*
-		 * Allocate blocks to the bitmap file.
-		 */
-		nmap = 1;
-		cancelflags |= XFS_TRANS_ABORT;
-		error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
-					XFS_BMAPI_METADATA, &firstblock,
-					resblks, &map, &nmap, &flist);
-		if (!error && nmap < 1)
-			error = XFS_ERROR(ENOSPC);
-		if (error)
-			goto error_cancel;
-		/*
-		 * Free any blocks freed up in the transaction, then commit.
-		 */
-		error = xfs_bmap_finish(&tp, &flist, &committed);
-		if (error)
-			goto error_cancel;
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		if (error)
-			goto error;
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (rbpp && *rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
 		/*
-		 * Now we need to clear the allocated blocks.
-		 * Do this one block per transaction, to keep it simple.
+		 * If there was an old one, get rid of it first.
 		 */
-		cancelflags = 0;
-		for (bno = map.br_startoff, fsbno = map.br_startblock;
-		     bno < map.br_startoff + map.br_blockcount;
-		     bno++, fsbno++) {
-			tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
-			/*
-			 * Reserve log for one block zeroing.
-			 */
-			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
-						  0, 0);
-			if (error)
-				goto error_cancel;
-			/*
-			 * Lock the bitmap inode.
-			 */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			/*
-			 * Get a buffer for the block.
-			 */
-			d = XFS_FSB_TO_DADDR(mp, fsbno);
-			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-				mp->m_bsize, 0);
-			if (bp == NULL) {
-				error = XFS_ERROR(EIO);
-error_cancel:
-				xfs_trans_cancel(tp, cancelflags);
-				goto error;
-			}
-			memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
-			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
-			/*
-			 * Commit the transaction.
-			 */
-			error = xfs_trans_commit(tp, 0);
-			if (error)
-				goto error;
+		if (rbpp && *rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
 		}
 		/*
-		 * Go on to the next extent, if any.
+		 * Remember this buffer and block for the next call.
 		 */
-		oblocks = map.br_startoff + map.br_blockcount;
+		if (rbpp) {
+			*rbpp = bp;
+			*rsb = sb;
+		}
 	}
+	/*
+	 * Point to the summary information & copy it out.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	*sum = *sp;
+	/*
+	 * Drop the buffer if we're not asked to remember it.
+	 */
+	if (!rbpp)
+		xfs_trans_brelse(tp, bp);
 	return 0;
-
-error:
-	return error;
 }
 
+
 /*
- * Attempt to allocate an extent minlen<=len<=maxlen starting from
- * bitmap block bbno.  If we don't get maxlen then use prod to trim
- * the length, if given.  Returns error; returns starting block in *rtblock.
- * The lengths are all in rtextents.
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
  */
 STATIC int				/* error */
-xfs_rtallocate_extent_block(
-	xfs_mount_t	*mp,		/* file system mount point */
+xfs_rtany_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
+	int		low,		/* low log2 extent size */
+	int		high,		/* high log2 extent size */
 	xfs_rtblock_t	bbno,		/* bitmap block number */
-	xfs_extlen_t	minlen,		/* minimum length to allocate */
-	xfs_extlen_t	maxlen,		/* maximum length to allocate */
-	xfs_extlen_t	*len,		/* out: actual length allocated */
-	xfs_rtblock_t	*nextp,		/* out: next block to try */
 	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
 	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
-	xfs_extlen_t	prod,		/* extent product factor */
-	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+	int		*stat)		/* out: any good extents here? */
 {
-	xfs_rtblock_t	besti;		/* best rtblock found so far */
-	xfs_rtblock_t	bestlen;	/* best length found so far */
-	xfs_rtblock_t	end;		/* last rtblock in chunk */
 	int		error;		/* error value */
-	xfs_rtblock_t	i;		/* current rtblock trying */
-	xfs_rtblock_t	next;		/* next rtblock to try */
-	int		stat;		/* status from internal calls */
+	int		log;		/* loop counter, log2 of ext. size */
+	xfs_suminfo_t	sum;		/* summary data */
 
 	/*
-	 * Loop over all the extents starting in this bitmap block,
-	 * looking for one that's long enough.
+	 * Loop over logs of extent sizes.  Order is irrelevant.
 	 */
-	for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
-		end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
-	     i <= end;
-	     i++) {
+	for (log = low; log <= high; log++) {
 		/*
-		 * See if there's a free extent of maxlen starting at i.
-		 * If it's not so then next will contain the first non-free.
+		 * Get one summary datum.
 		 */
-		error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+		error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
 		if (error) {
 			return error;
 		}
-		if (stat) {
-			/*
-			 * i for maxlen is all free, allocate and return that.
-			 */
-			error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
-				rsb);
-			if (error) {
-				return error;
-			}
-			*len = maxlen;
-			*rtblock = i;
-			return 0;
-		}
 		/*
-		 * In the case where we have a variable-sized allocation
-		 * request, figure out how big this free piece is,
-		 * and if it's big enough for the minimum, and the best
-		 * so far, remember it.
+		 * If there are any, return success.
 		 */
-		if (minlen < maxlen) {
-			xfs_rtblock_t	thislen;	/* this extent size */
-
-			thislen = next - i;
-			if (thislen >= minlen && thislen > bestlen) {
-				besti = i;
-				bestlen = thislen;
-			}
+		if (sum) {
+			*stat = 1;
+			return 0;
 		}
-		/*
-		 * If not done yet, find the start of the next free space.
-		 */
-		if (next < end) {
-			error = xfs_rtfind_forw(mp, tp, next, end, &i);
-			if (error) {
-				return error;
-			}
-		} else
-			break;
 	}
 	/*
-	 * Searched the whole thing & didn't find a maxlen free extent.
+	 * Found nothing, return failure.
 	 */
-	if (minlen < maxlen && besti != -1) {
-		xfs_extlen_t	p;	/* amount to trim length by */
+	*stat = 0;
+	return 0;
+}
 
-		/*
-		 * If size should be a multiple of prod, make that so.
-		 */
-		if (prod > 1 && (p = do_mod(bestlen, prod)))
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int				/* error */
+xfs_rtcopy_summary(
+	xfs_mount_t	*omp,		/* old file system mount point */
+	xfs_mount_t	*nmp,		/* new file system mount point */
+	xfs_trans_t	*tp)		/* transaction pointer */
+{
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* summary buffer */
+	int		error;		/* error return value */
+	int		log;		/* summary level number (log length) */
+	xfs_suminfo_t	sum;		/* summary data */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+
+	bp = NULL;
+	for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+		for (bbno = omp->m_sb.sb_rbmblocks - 1;
+		     (xfs_srtblock_t)bbno >= 0;
+		     bbno--) {
+			error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+				&sumbno, &sum);
+			if (error)
+				return error;
+			if (sum == 0)
+				continue;
+			error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			ASSERT(sum > 0);
+		}
+	}
+	return 0;
+}
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int				/* error */
+xfs_rtallocate_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* start block to allocate */
+	xfs_extlen_t	len,		/* length to allocate */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the allocated extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock = 0;	/* first block allocated > end */
+	xfs_rtblock_t	preblock = 0;	/* first block allocated < start */
+
+	end = start + len - 1;
+	/*
+	 * Assume we're allocating out of the middle of a free extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of free extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Decrement the summary information corresponding to the entire
+	 * (old) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If there are blocks not being allocated at the front of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being allocated at the end of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Modify the bitmap to mark this extent allocated.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 0);
+	return error;
+}
+
+/*
+ * Attempt to allocate an extent minlen<=len<=maxlen starting from
+ * bitmap block bbno.  If we don't get maxlen then use prod to trim
+ * the length, if given.  Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_block(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_rtblock_t	*nextp,		/* out: next block to try */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	xfs_rtblock_t	besti;		/* best rtblock found so far */
+	xfs_rtblock_t	bestlen;	/* best length found so far */
+	xfs_rtblock_t	end;		/* last rtblock in chunk */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current rtblock trying */
+	xfs_rtblock_t	next;		/* next rtblock to try */
+	int		stat;		/* status from internal calls */
+
+	/*
+	 * Loop over all the extents starting in this bitmap block,
+	 * looking for one that's long enough.
+	 */
+	for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
+		end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+	     i <= end;
+	     i++) {
+		/*
+		 * See if there's a free extent of maxlen starting at i.
+		 * If it's not so then next will contain the first non-free.
+		 */
+		error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+		if (error) {
+			return error;
+		}
+		if (stat) {
+			/*
+			 * i for maxlen is all free, allocate and return that.
+			 */
+			error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
+				rsb);
+			if (error) {
+				return error;
+			}
+			*len = maxlen;
+			*rtblock = i;
+			return 0;
+		}
+		/*
+		 * In the case where we have a variable-sized allocation
+		 * request, figure out how big this free piece is,
+		 * and if it's big enough for the minimum, and the best
+		 * so far, remember it.
+		 */
+		if (minlen < maxlen) {
+			xfs_rtblock_t	thislen;	/* this extent size */
+
+			thislen = next - i;
+			if (thislen >= minlen && thislen > bestlen) {
+				besti = i;
+				bestlen = thislen;
+			}
+		}
+		/*
+		 * If not done yet, find the start of the next free space.
+		 */
+		if (next < end) {
+			error = xfs_rtfind_forw(mp, tp, next, end, &i);
+			if (error) {
+				return error;
+			}
+		} else
+			break;
+	}
+	/*
+	 * Searched the whole thing & didn't find a maxlen free extent.
+	 */
+	if (minlen < maxlen && besti != -1) {
+		xfs_extlen_t	p;	/* amount to trim length by */
+
+		/*
+		 * If size should be a multiple of prod, make that so.
+		 */
+		if (prod > 1 && (p = do_mod(bestlen, prod)))
 			bestlen -= p;
 		/*
 		 * Allocate besti for bestlen & return that.
@@ -638,1191 +727,205 @@ xfs_rtallocate_extent_size(
 			 */
 			if (r != NULLRTBLOCK) {
 				*rtblock = r;
-				return 0;
-			}
-			/*
-			 * If the "next block to try" returned from the
-			 * allocator is beyond the next bitmap block,
-			 * skip to that bitmap block.
-			 */
-			if (XFS_BITTOBLOCK(mp, n) > i + 1)
-				i = XFS_BITTOBLOCK(mp, n) - 1;
-		}
-	}
-	/*
-	 * Didn't find any maxlen blocks.  Try smaller ones, unless
-	 * we're asking for a fixed size extent.
-	 */
-	if (minlen > --maxlen) {
-		*rtblock = NULLRTBLOCK;
-		return 0;
-	}
-	ASSERT(minlen != 0);
-	ASSERT(maxlen != 0);
-
-	/*
-	 * Loop over sizes, from maxlen down to minlen.
-	 * This time, when we do the allocations, allow smaller ones
-	 * to succeed.
-	 */
-	for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
-		/*
-		 * Loop over all the bitmap blocks, try an allocation
-		 * starting in that block.
-		 */
-		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
-			/*
-			 * Get the summary information for this level/block.
-			 */
-			error =	xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
-						  &sum);
-			if (error) {
-				return error;
-			}
-			/*
-			 * If nothing there, go on to next.
-			 */
-			if (!sum)
-				continue;
-			/*
-			 * Try the allocation.  Make sure the specified
-			 * minlen/maxlen are in the possible range for
-			 * this summary level.
-			 */
-			error = xfs_rtallocate_extent_block(mp, tp, i,
-					XFS_RTMAX(minlen, 1 << l),
-					XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
-					len, &n, rbpp, rsb, prod, &r);
-			if (error) {
-				return error;
-			}
-			/*
-			 * If it worked, return that extent.
-			 */
-			if (r != NULLRTBLOCK) {
-				*rtblock = r;
-				return 0;
-			}
-			/*
-			 * If the "next block to try" returned from the
-			 * allocator is beyond the next bitmap block,
-			 * skip to that bitmap block.
-			 */
-			if (XFS_BITTOBLOCK(mp, n) > i + 1)
-				i = XFS_BITTOBLOCK(mp, n) - 1;
-		}
-	}
-	/*
-	 * Got nothing, return failure.
-	 */
-	*rtblock = NULLRTBLOCK;
-	return 0;
-}
-
-/*
- * Mark an extent specified by start and len allocated.
- * Updates all the summary information as well as the bitmap.
- */
-STATIC int				/* error */
-xfs_rtallocate_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* start block to allocate */
-	xfs_extlen_t	len,		/* length to allocate */
-	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
-{
-	xfs_rtblock_t	end;		/* end of the allocated extent */
-	int		error;		/* error value */
-	xfs_rtblock_t	postblock = 0;	/* first block allocated > end */
-	xfs_rtblock_t	preblock = 0;	/* first block allocated < start */
-
-	end = start + len - 1;
-	/*
-	 * Assume we're allocating out of the middle of a free extent.
-	 * We need to find the beginning and end of the extent so we can
-	 * properly update the summary.
-	 */
-	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Find the next allocated block (end of free extent).
-	 */
-	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
-		&postblock);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Decrement the summary information corresponding to the entire
-	 * (old) free extent.
-	 */
-	error = xfs_rtmodify_summary(mp, tp,
-		XFS_RTBLOCKLOG(postblock + 1 - preblock),
-		XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
-	if (error) {
-		return error;
-	}
-	/*
-	 * If there are blocks not being allocated at the front of the
-	 * old extent, add summary data for them to be free.
-	 */
-	if (preblock < start) {
-		error = xfs_rtmodify_summary(mp, tp,
-			XFS_RTBLOCKLOG(start - preblock),
-			XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
-		if (error) {
-			return error;
-		}
-	}
-	/*
-	 * If there are blocks not being allocated at the end of the
-	 * old extent, add summary data for them to be free.
-	 */
-	if (postblock > end) {
-		error = xfs_rtmodify_summary(mp, tp,
-			XFS_RTBLOCKLOG(postblock - end),
-			XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
-		if (error) {
-			return error;
-		}
-	}
-	/*
-	 * Modify the bitmap to mark this extent allocated.
-	 */
-	error = xfs_rtmodify_range(mp, tp, start, len, 0);
-	return error;
-}
-
-/*
- * Return whether there are any free extents in the size range given
- * by low and high, for the bitmap block bbno.
- */
-STATIC int				/* error */
-xfs_rtany_summary(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	int		low,		/* low log2 extent size */
-	int		high,		/* high log2 extent size */
-	xfs_rtblock_t	bbno,		/* bitmap block number */
-	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
-	int		*stat)		/* out: any good extents here? */
-{
-	int		error;		/* error value */
-	int		log;		/* loop counter, log2 of ext. size */
-	xfs_suminfo_t	sum;		/* summary data */
-
-	/*
-	 * Loop over logs of extent sizes.  Order is irrelevant.
-	 */
-	for (log = low; log <= high; log++) {
-		/*
-		 * Get one summary datum.
-		 */
-		error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
-		if (error) {
-			return error;
-		}
-		/*
-		 * If there are any, return success.
-		 */
-		if (sum) {
-			*stat = 1;
-			return 0;
-		}
-	}
-	/*
-	 * Found nothing, return failure.
-	 */
-	*stat = 0;
-	return 0;
-}
-
-/*
- * Get a buffer for the bitmap or summary file block specified.
- * The buffer is returned read and locked.
- */
-STATIC int				/* error */
-xfs_rtbuf_get(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	block,		/* block number in bitmap or summary */
-	int		issum,		/* is summary not bitmap */
-	xfs_buf_t	**bpp)		/* output: buffer for the block */
-{
-	xfs_buf_t	*bp;		/* block buffer, result */
-	xfs_inode_t	*ip;		/* bitmap or summary inode */
-	xfs_bmbt_irec_t	map;
-	int		nmap = 1;
-	int		error;		/* error value */
-
-	ip = issum ? mp->m_rsumip : mp->m_rbmip;
-
-	error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
-	if (error)
-		return error;
-
-	ASSERT(map.br_startblock != NULLFSBLOCK);
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
-				   mp->m_bsize, 0, &bp, NULL);
-	if (error)
-		return error;
-	ASSERT(!xfs_buf_geterror(bp));
-	*bpp = bp;
-	return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that the given extent (block range) is allocated already.
- */
-STATIC int				/* error */
-xfs_rtcheck_alloc_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	bno,		/* starting block number of extent */
-	xfs_extlen_t	len,		/* length of extent */
-	int		*stat)		/* out: 1 for allocated, 0 for not */
-{
-	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
-
-	return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
-}
-#endif
-
-/*
- * Check that the given range is either all allocated (val = 0) or
- * all free (val = 1).
- */
-STATIC int				/* error */
-xfs_rtcheck_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block number of extent */
-	xfs_extlen_t	len,		/* length of extent */
-	int		val,		/* 1 for free, 0 for allocated */
-	xfs_rtblock_t	*new,		/* out: first block not matching */
-	int		*stat)		/* out: 1 for matches, 0 for not */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtblock_t	i;		/* current bit number rel. to start */
-	xfs_rtblock_t	lastbit;	/* last useful bit in word */
-	xfs_rtword_t	mask;		/* mask of relevant bits for value */
-	xfs_rtword_t	wdiff;		/* difference from wanted value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute starting bitmap block number
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	/*
-	 * Read the bitmap block.
-	 */
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
-	}
-	bufp = bp->b_addr;
-	/*
-	 * Compute the starting word's address, and starting bit.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
-	/*
-	 * 0 (allocated) => all zero's; 1 (free) => all one's.
-	 */
-	val = -val;
-	/*
-	 * If not starting on a word boundary, deal with the first
-	 * (partial) word.
-	 */
-	if (bit) {
-		/*
-		 * Compute first bit not examined.
-		 */
-		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-		/*
-		 * Mask of relevant bits.
-		 */
-		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ val) & mask)) {
-			/*
-			 * Different, compute first wrong bit and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i = XFS_RTLOBIT(wdiff) - bit;
-			*new = start + i;
-			*stat = 0;
-			return 0;
-		}
-		i = lastbit - bit;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the next one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer.
-			 */
-			b++;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the next one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = *b ^ val)) {
-			/*
-			 * Different, compute first wrong bit and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*new = start + i;
-			*stat = 0;
-			return 0;
-		}
-		i += XFS_NBWORD;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the next one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer.
-			 */
-			b++;
-		}
-	}
-	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if ((lastbit = len - i)) {
-		/*
-		 * Mask of relevant bits.
-		 */
-		mask = ((xfs_rtword_t)1 << lastbit) - 1;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ val) & mask)) {
-			/*
-			 * Different, compute first wrong bit and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*new = start + i;
-			*stat = 0;
-			return 0;
-		} else
-			i = len;
-	}
-	/*
-	 * Successful, return.
-	 */
-	xfs_trans_brelse(tp, bp);
-	*new = start + i;
-	*stat = 1;
-	return 0;
-}
-
-/*
- * Copy and transform the summary file, given the old and new
- * parameters in the mount structures.
- */
-STATIC int				/* error */
-xfs_rtcopy_summary(
-	xfs_mount_t	*omp,		/* old file system mount point */
-	xfs_mount_t	*nmp,		/* new file system mount point */
-	xfs_trans_t	*tp)		/* transaction pointer */
-{
-	xfs_rtblock_t	bbno;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* summary buffer */
-	int		error;		/* error return value */
-	int		log;		/* summary level number (log length) */
-	xfs_suminfo_t	sum;		/* summary data */
-	xfs_fsblock_t	sumbno;		/* summary block number */
-
-	bp = NULL;
-	for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
-		for (bbno = omp->m_sb.sb_rbmblocks - 1;
-		     (xfs_srtblock_t)bbno >= 0;
-		     bbno--) {
-			error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
-				&sumbno, &sum);
-			if (error)
-				return error;
-			if (sum == 0)
-				continue;
-			error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
-				&bp, &sumbno);
-			if (error)
-				return error;
-			error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
-				&bp, &sumbno);
-			if (error)
-				return error;
-			ASSERT(sum > 0);
-		}
-	}
-	return 0;
-}
-
-/*
- * Searching backward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-STATIC int				/* error */
-xfs_rtfind_back(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to look at */
-	xfs_rtblock_t	limit,		/* last block to look at */
-	xfs_rtblock_t	*rtblock)	/* out: start block found */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtblock_t	firstbit;	/* first useful bit in the word */
-	xfs_rtblock_t	i;		/* current bit number rel. to start */
-	xfs_rtblock_t	len;		/* length of inspected area */
-	xfs_rtword_t	mask;		/* mask of relevant bits for value */
-	xfs_rtword_t	want;		/* mask for "good" values */
-	xfs_rtword_t	wdiff;		/* difference from wanted value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute and read in starting bitmap block for starting block.
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
-	}
-	bufp = bp->b_addr;
-	/*
-	 * Get the first word's index & point to it.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
-	len = start - limit + 1;
-	/*
-	 * Compute match value, based on the bit at start: if 1 (free)
-	 * then all-ones, else all-zeroes.
-	 */
-	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
-	/*
-	 * If the starting position is not word-aligned, deal with the
-	 * partial word.
-	 */
-	if (bit < XFS_NBWORD - 1) {
-		/*
-		 * Calculate first (leftmost) bit number to look at,
-		 * and mask for all the relevant bits in this word.
-		 */
-		firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
-		mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
-			firstbit;
-		/*
-		 * Calculate the difference between the value there
-		 * and what we're looking for.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different.  Mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i = bit - XFS_RTHIBIT(wdiff);
-			*rtblock = start - i + 1;
-			return 0;
-		}
-		i = bit - firstbit + 1;
-		/*
-		 * Go on to previous block if that's where the previous word is
-		 * and we need the previous word.
-		 */
-		if (--word == -1 && i < len) {
-			/*
-			 * If done with this block, get the previous one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			bufp = bp->b_addr;
-			word = XFS_BLOCKWMASK(mp);
-			b = &bufp[word];
-		} else {
-			/*
-			 * Go on to the previous word in the buffer.
-			 */
-			b--;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the previous one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = *b ^ want)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
-			*rtblock = start - i + 1;
-			return 0;
-		}
-		i += XFS_NBWORD;
-		/*
-		 * Go on to previous block if that's where the previous word is
-		 * and we need the previous word.
-		 */
-		if (--word == -1 && i < len) {
-			/*
-			 * If done with this block, get the previous one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			bufp = bp->b_addr;
-			word = XFS_BLOCKWMASK(mp);
-			b = &bufp[word];
-		} else {
-			/*
-			 * Go on to the previous word in the buffer.
-			 */
-			b--;
-		}
-	}
-	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if (len - i) {
-		/*
-		 * Calculate first (leftmost) bit number to look at,
-		 * and mask for all the relevant bits in this word.
-		 */
-		firstbit = XFS_NBWORD - (len - i);
-		mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
-			*rtblock = start - i + 1;
-			return 0;
-		} else
-			i = len;
-	}
-	/*
-	 * No match, return that we scanned the whole area.
-	 */
-	xfs_trans_brelse(tp, bp);
-	*rtblock = start - i + 1;
-	return 0;
-}
-
-/*
- * Searching forward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-STATIC int				/* error */
-xfs_rtfind_forw(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to look at */
-	xfs_rtblock_t	limit,		/* last block to look at */
-	xfs_rtblock_t	*rtblock)	/* out: start block found */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtblock_t	i;		/* current bit number rel. to start */
-	xfs_rtblock_t	lastbit;	/* last useful bit in the word */
-	xfs_rtblock_t	len;		/* length of inspected area */
-	xfs_rtword_t	mask;		/* mask of relevant bits for value */
-	xfs_rtword_t	want;		/* mask for "good" values */
-	xfs_rtword_t	wdiff;		/* difference from wanted value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute and read in starting bitmap block for starting block.
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
-	}
-	bufp = bp->b_addr;
-	/*
-	 * Get the first word's index & point to it.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
-	len = limit - start + 1;
-	/*
-	 * Compute match value, based on the bit at start: if 1 (free)
-	 * then all-ones, else all-zeroes.
-	 */
-	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
-	/*
-	 * If the starting position is not word-aligned, deal with the
-	 * partial word.
-	 */
-	if (bit) {
-		/*
-		 * Calculate last (rightmost) bit number to look at,
-		 * and mask for all the relevant bits in this word.
-		 */
-		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-		/*
-		 * Calculate the difference between the value there
-		 * and what we're looking for.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different.  Mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i = XFS_RTLOBIT(wdiff) - bit;
-			*rtblock = start + i - 1;
-			return 0;
-		}
-		i = lastbit - bit;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the previous one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the previous word in the buffer.
-			 */
-			b++;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the next one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = *b ^ want)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*rtblock = start + i - 1;
-			return 0;
-		}
-		i += XFS_NBWORD;
-		/*
-		 * Go on to next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
-			/*
-			 * If done with this block, get the next one.
-			 */
-			xfs_trans_brelse(tp, bp);
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
-			if (error) {
-				return error;
-			}
-			b = bufp = bp->b_addr;
-			word = 0;
-		} else {
-			/*
-			 * Go on to the next word in the buffer.
-			 */
-			b++;
-		}
-	}
-	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if ((lastbit = len - i)) {
-		/*
-		 * Calculate mask for all the relevant bits in this word.
-		 */
-		mask = ((xfs_rtword_t)1 << lastbit) - 1;
-		/*
-		 * Compute difference between actual and desired value.
-		 */
-		if ((wdiff = (*b ^ want) & mask)) {
-			/*
-			 * Different, mark where we are and return.
-			 */
-			xfs_trans_brelse(tp, bp);
-			i += XFS_RTLOBIT(wdiff);
-			*rtblock = start + i - 1;
-			return 0;
-		} else
-			i = len;
-	}
-	/*
-	 * No match, return that we scanned the whole area.
-	 */
-	xfs_trans_brelse(tp, bp);
-	*rtblock = start + i - 1;
-	return 0;
-}
-
-/*
- * Mark an extent specified by start and len freed.
- * Updates all the summary information as well as the bitmap.
- */
-STATIC int				/* error */
-xfs_rtfree_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to free */
-	xfs_extlen_t	len,		/* length to free */
-	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
-{
-	xfs_rtblock_t	end;		/* end of the freed extent */
-	int		error;		/* error value */
-	xfs_rtblock_t	postblock;	/* first block freed > end */
-	xfs_rtblock_t	preblock;	/* first block freed < start */
-
-	end = start + len - 1;
-	/*
-	 * Modify the bitmap to mark this extent freed.
-	 */
-	error = xfs_rtmodify_range(mp, tp, start, len, 1);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Assume we're freeing out of the middle of an allocated extent.
-	 * We need to find the beginning and end of the extent so we can
-	 * properly update the summary.
-	 */
-	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Find the next allocated block (end of allocated extent).
-	 */
-	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
-		&postblock);
-	if (error)
-		return error;
-	/*
-	 * If there are blocks not being freed at the front of the
-	 * old extent, add summary data for them to be allocated.
-	 */
-	if (preblock < start) {
-		error = xfs_rtmodify_summary(mp, tp,
-			XFS_RTBLOCKLOG(start - preblock),
-			XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
-		if (error) {
-			return error;
-		}
-	}
-	/*
-	 * If there are blocks not being freed at the end of the
-	 * old extent, add summary data for them to be allocated.
-	 */
-	if (postblock > end) {
-		error = xfs_rtmodify_summary(mp, tp,
-			XFS_RTBLOCKLOG(postblock - end),
-			XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
-		if (error) {
-			return error;
-		}
-	}
-	/*
-	 * Increment the summary information corresponding to the entire
-	 * (new) free extent.
-	 */
-	error = xfs_rtmodify_summary(mp, tp,
-		XFS_RTBLOCKLOG(postblock + 1 - preblock),
-		XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
-	return error;
-}
-
-/*
- * Read and return the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-STATIC int				/* error */
-xfs_rtget_summary(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	int		log,		/* log2 of extent size */
-	xfs_rtblock_t	bbno,		/* bitmap block number */
-	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
-	xfs_suminfo_t	*sum)		/* out: summary info for this block */
-{
-	xfs_buf_t	*bp;		/* buffer for summary block */
-	int		error;		/* error value */
-	xfs_fsblock_t	sb;		/* summary fsblock */
-	int		so;		/* index into the summary file */
-	xfs_suminfo_t	*sp;		/* pointer to returned data */
-
-	/*
-	 * Compute entry number in the summary file.
-	 */
-	so = XFS_SUMOFFS(mp, log, bbno);
-	/*
-	 * Compute the block number in the summary file.
-	 */
-	sb = XFS_SUMOFFSTOBLOCK(mp, so);
-	/*
-	 * If we have an old buffer, and the block number matches, use that.
-	 */
-	if (rbpp && *rbpp && *rsb == sb)
-		bp = *rbpp;
-	/*
-	 * Otherwise we have to get the buffer.
-	 */
-	else {
-		/*
-		 * If there was an old one, get rid of it first.
-		 */
-		if (rbpp && *rbpp)
-			xfs_trans_brelse(tp, *rbpp);
-		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
-		if (error) {
-			return error;
-		}
-		/*
-		 * Remember this buffer and block for the next call.
-		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
-		}
-	}
-	/*
-	 * Point to the summary information & copy it out.
-	 */
-	sp = XFS_SUMPTR(mp, bp, so);
-	*sum = *sp;
-	/*
-	 * Drop the buffer if we're not asked to remember it.
-	 */
-	if (!rbpp)
-		xfs_trans_brelse(tp, bp);
-	return 0;
-}
-
-/*
- * Set the given range of bitmap bits to the given value.
- * Do whatever I/O and logging is required.
- */
-STATIC int				/* error */
-xfs_rtmodify_range(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	start,		/* starting block to modify */
-	xfs_extlen_t	len,		/* length of extent to modify */
-	int		val)		/* 1 for free, 0 for allocated */
-{
-	xfs_rtword_t	*b;		/* current word in buffer */
-	int		bit;		/* bit number in the word */
-	xfs_rtblock_t	block;		/* bitmap block number */
-	xfs_buf_t	*bp;		/* buf for the block */
-	xfs_rtword_t	*bufp;		/* starting word in buffer */
-	int		error;		/* error value */
-	xfs_rtword_t	*first;		/* first used word in the buffer */
-	int		i;		/* current bit number rel. to start */
-	int		lastbit;	/* last useful bit in word */
-	xfs_rtword_t	mask;		/* mask o frelevant bits for value */
-	int		word;		/* word number in the buffer */
-
-	/*
-	 * Compute starting bitmap block number.
-	 */
-	block = XFS_BITTOBLOCK(mp, start);
-	/*
-	 * Read the bitmap block, and point to its data.
-	 */
-	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
-	if (error) {
-		return error;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
+			 */
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
+		}
 	}
-	bufp = bp->b_addr;
-	/*
-	 * Compute the starting word's address, and starting bit.
-	 */
-	word = XFS_BITTOWORD(mp, start);
-	first = b = &bufp[word];
-	bit = (int)(start & (XFS_NBWORD - 1));
 	/*
-	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
+	 * Didn't find any maxlen blocks.  Try smaller ones, unless
+	 * we're asking for a fixed size extent.
 	 */
-	val = -val;
+	if (minlen > --maxlen) {
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+	ASSERT(minlen != 0);
+	ASSERT(maxlen != 0);
+
 	/*
-	 * If not starting on a word boundary, deal with the first
-	 * (partial) word.
+	 * Loop over sizes, from maxlen down to minlen.
+	 * This time, when we do the allocations, allow smaller ones
+	 * to succeed.
 	 */
-	if (bit) {
-		/*
-		 * Compute first bit not changed and mask of relevant bits.
-		 */
-		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
-		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
-		/*
-		 * Set/clear the active bits.
-		 */
-		if (val)
-			*b |= mask;
-		else
-			*b &= ~mask;
-		i = lastbit - bit;
+	for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
 		/*
-		 * Go on to the next block if that's where the next word is
-		 * and we need the next word.
+		 * Loop over all the bitmap blocks, try an allocation
+		 * starting in that block.
 		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
 			/*
-			 * Log the changed part of this block.
-			 * Get the next one.
+			 * Get the summary information for this level/block.
 			 */
-			xfs_trans_log_buf(tp, bp,
-				(uint)((char *)first - (char *)bufp),
-				(uint)((char *)b - (char *)bufp));
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			error =	xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+						  &sum);
 			if (error) {
 				return error;
 			}
-			first = b = bufp = bp->b_addr;
-			word = 0;
-		} else {
 			/*
-			 * Go on to the next word in the buffer
+			 * If nothing there, go on to next.
 			 */
-			b++;
-		}
-	} else {
-		/*
-		 * Starting on a word boundary, no partial word.
-		 */
-		i = 0;
-	}
-	/*
-	 * Loop over whole words in buffers.  When we use up one buffer
-	 * we move on to the next one.
-	 */
-	while (len - i >= XFS_NBWORD) {
-		/*
-		 * Set the word value correctly.
-		 */
-		*b = val;
-		i += XFS_NBWORD;
-		/*
-		 * Go on to the next block if that's where the next word is
-		 * and we need the next word.
-		 */
-		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			if (!sum)
+				continue;
 			/*
-			 * Log the changed part of this block.
-			 * Get the next one.
+			 * Try the allocation.  Make sure the specified
+			 * minlen/maxlen are in the possible range for
+			 * this summary level.
 			 */
-			xfs_trans_log_buf(tp, bp,
-				(uint)((char *)first - (char *)bufp),
-				(uint)((char *)b - (char *)bufp));
-			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			error = xfs_rtallocate_extent_block(mp, tp, i,
+					XFS_RTMAX(minlen, 1 << l),
+					XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
+					len, &n, rbpp, rsb, prod, &r);
 			if (error) {
 				return error;
 			}
-			first = b = bufp = bp->b_addr;
-			word = 0;
-		} else {
 			/*
-			 * Go on to the next word in the buffer
+			 * If it worked, return that extent.
+			 */
+			if (r != NULLRTBLOCK) {
+				*rtblock = r;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
 			 */
-			b++;
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
 		}
 	}
 	/*
-	 * If not ending on a word boundary, deal with the last
-	 * (partial) word.
-	 */
-	if ((lastbit = len - i)) {
-		/*
-		 * Compute a mask of relevant bits.
-		 */
-		bit = 0;
-		mask = ((xfs_rtword_t)1 << lastbit) - 1;
-		/*
-		 * Set/clear the active bits.
-		 */
-		if (val)
-			*b |= mask;
-		else
-			*b &= ~mask;
-		b++;
-	}
-	/*
-	 * Log any remaining changed bytes.
+	 * Got nothing, return failure.
 	 */
-	if (b > first)
-		xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
-			(uint)((char *)b - (char *)bufp - 1));
+	*rtblock = NULLRTBLOCK;
 	return 0;
 }
 
 /*
- * Read and modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
  */
 STATIC int				/* error */
-xfs_rtmodify_summary(
+xfs_growfs_rt_alloc(
 	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	int		log,		/* log2 of extent size */
-	xfs_rtblock_t	bbno,		/* bitmap block number */
-	int		delta,		/* change to make to summary info */
-	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+	xfs_extlen_t	oblocks,	/* old count of blocks */
+	xfs_extlen_t	nblocks,	/* new count of blocks */
+	xfs_inode_t	*ip)		/* inode (bitmap/summary) */
 {
-	xfs_buf_t	*bp;		/* buffer for the summary block */
-	int		error;		/* error value */
-	xfs_fsblock_t	sb;		/* summary fsblock */
-	int		so;		/* index into the summary file */
-	xfs_suminfo_t	*sp;		/* pointer to returned data */
+	xfs_fileoff_t	bno;		/* block number in file */
+	xfs_buf_t	*bp;		/* temporary buffer for zeroing */
+	int		committed;	/* transaction committed flag */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;		/* error return value */
+	xfs_fsblock_t	firstblock;	/* first block allocated in xaction */
+	xfs_bmap_free_t	flist;		/* list of freed blocks */
+	xfs_fsblock_t	fsbno;		/* filesystem block for bno */
+	xfs_bmbt_irec_t	map;		/* block map output */
+	int		nmap;		/* number of block maps */
+	int		resblks;	/* space reservation */
 
 	/*
-	 * Compute entry number in the summary file.
-	 */
-	so = XFS_SUMOFFS(mp, log, bbno);
-	/*
-	 * Compute the block number in the summary file.
-	 */
-	sb = XFS_SUMOFFSTOBLOCK(mp, so);
-	/*
-	 * If we have an old buffer, and the block number matches, use that.
-	 */
-	if (rbpp && *rbpp && *rsb == sb)
-		bp = *rbpp;
-	/*
-	 * Otherwise we have to get the buffer.
+	 * Allocate space to the file, as necessary.
 	 */
-	else {
+	while (oblocks < nblocks) {
+		int		cancelflags = 0;
+		xfs_trans_t	*tp;
+
+		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
+		resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
 		/*
-		 * If there was an old one, get rid of it first.
+		 * Reserve space & log for one extent added to the file.
 		 */
-		if (rbpp && *rbpp)
-			xfs_trans_brelse(tp, *rbpp);
-		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
-		if (error) {
-			return error;
-		}
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+					  resblks, 0);
+		if (error)
+			goto error_cancel;
+		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
 		/*
-		 * Remember this buffer and block for the next call.
+		 * Lock the inode.
 		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+		xfs_bmap_init(&flist, &firstblock);
+		/*
+		 * Allocate blocks to the bitmap file.
+		 */
+		nmap = 1;
+		cancelflags |= XFS_TRANS_ABORT;
+		error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
+					XFS_BMAPI_METADATA, &firstblock,
+					resblks, &map, &nmap, &flist);
+		if (!error && nmap < 1)
+			error = XFS_ERROR(ENOSPC);
+		if (error)
+			goto error_cancel;
+		/*
+		 * Free any blocks freed up in the transaction, then commit.
+		 */
+		error = xfs_bmap_finish(&tp, &flist, &committed);
+		if (error)
+			goto error_cancel;
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		if (error)
+			goto error;
+		/*
+		 * Now we need to clear the allocated blocks.
+		 * Do this one block per transaction, to keep it simple.
+		 */
+		cancelflags = 0;
+		for (bno = map.br_startoff, fsbno = map.br_startblock;
+		     bno < map.br_startoff + map.br_blockcount;
+		     bno++, fsbno++) {
+			tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
+			/*
+			 * Reserve log for one block zeroing.
+			 */
+			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+						  0, 0);
+			if (error)
+				goto error_cancel;
+			/*
+			 * Lock the bitmap inode.
+			 */
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+			/*
+			 * Get a buffer for the block.
+			 */
+			d = XFS_FSB_TO_DADDR(mp, fsbno);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+				mp->m_bsize, 0);
+			if (bp == NULL) {
+				error = XFS_ERROR(EIO);
+error_cancel:
+				xfs_trans_cancel(tp, cancelflags);
+				goto error;
+			}
+			memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
+			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+			/*
+			 * Commit the transaction.
+			 */
+			error = xfs_trans_commit(tp, 0);
+			if (error)
+				goto error;
 		}
+		/*
+		 * Go on to the next extent, if any.
+		 */
+		oblocks = map.br_startoff + map.br_blockcount;
 	}
-	/*
-	 * Point to the summary information, modify and log it.
-	 */
-	sp = XFS_SUMPTR(mp, bp, so);
-	*sp += delta;
-	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
-		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
 	return 0;
+
+error:
+	return error;
 }
 
 /*
@@ -2127,66 +1230,6 @@ xfs_rtallocate_extent(
 	return 0;
 }
 
-/*
- * Free an extent in the realtime subvolume.  Length is expressed in
- * realtime extents, as is the block number.
- */
-int					/* error */
-xfs_rtfree_extent(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_rtblock_t	bno,		/* starting block number to free */
-	xfs_extlen_t	len)		/* length of extent freed */
-{
-	int		error;		/* error value */
-	xfs_mount_t	*mp;		/* file system mount structure */
-	xfs_fsblock_t	sb;		/* summary file block number */
-	xfs_buf_t	*sumbp;		/* summary file block buffer */
-
-	mp = tp->t_mountp;
-
-	ASSERT(mp->m_rbmip->i_itemp != NULL);
-	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
-
-#ifdef DEBUG
-	/*
-	 * Check to see that this whole range is currently allocated.
-	 */
-	{
-		int	stat;		/* result from checking range */
-
-		error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
-		if (error) {
-			return error;
-		}
-		ASSERT(stat);
-	}
-#endif
-	sumbp = NULL;
-	/*
-	 * Free the range of realtime blocks.
-	 */
-	error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
-	if (error) {
-		return error;
-	}
-	/*
-	 * Mark more blocks free in the superblock.
-	 */
-	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
-	/*
-	 * If we've now freed all the blocks, reset the file sequence
-	 * number to 0.
-	 */
-	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
-	    mp->m_sb.sb_rextents) {
-		if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
-			mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
-		*(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
-		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
-	}
-	return 0;
-}
-
 /*
  * Initialize realtime fields in the mount structure.
  */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2a1a24c0e2f..752b63d10300 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -95,6 +95,30 @@ xfs_growfs_rt(
 	struct xfs_mount	*mp,	/* file system mount structure */
 	xfs_growfs_rt_t		*in);	/* user supplied growfs struct */
 
+/*
+ * From xfs_rtbitmap.c
+ */
+int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
+		  xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
+int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
+		      xfs_rtblock_t start, xfs_extlen_t len, int val,
+		      xfs_rtblock_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
+		    xfs_rtblock_t start, xfs_rtblock_t limit,
+		    xfs_rtblock_t *rtblock);
+int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
+		    xfs_rtblock_t start, xfs_rtblock_t limit,
+		    xfs_rtblock_t *rtblock);
+int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
+		       xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
+			 xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
+			 xfs_fsblock_t *rsb);
+int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
+		     xfs_rtblock_t start, xfs_extlen_t len,
+		     struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
+
+
 #else
 # define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb)  (ENOSYS)
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c
new file mode 100644
index 000000000000..e30efe8be5e6
--- /dev/null
+++ b/fs/xfs/xfs_rtbitmap.c
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	block,		/* block number in bitmap or summary */
+	int		issum,		/* is summary not bitmap */
+	xfs_buf_t	**bpp)		/* output: buffer for the block */
+{
+	xfs_buf_t	*bp;		/* block buffer, result */
+	xfs_inode_t	*ip;		/* bitmap or summary inode */
+	xfs_bmbt_irec_t	map;
+	int		nmap = 1;
+	int		error;		/* error value */
+
+	ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+	error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	ASSERT(map.br_startblock != NULLFSBLOCK);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
+				   mp->m_bsize, 0, &bp, NULL);
+	if (error)
+		return error;
+	ASSERT(!xfs_buf_geterror(bp));
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	firstbit;	/* first useful bit in the word */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = start - limit + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit < XFS_NBWORD - 1) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+		mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+			firstbit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = bit - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i = bit - firstbit + 1;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = bp->b_addr;
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the previous one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = bp->b_addr;
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if (len - i) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_NBWORD - (len - i);
+		mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start - i + 1;
+	return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in the word */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = limit - start + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit) {
+		/*
+		 * Calculate last (rightmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Calculate mask for all the relevant bits in this word.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start + i - 1;
+	return 0;
+}
+
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+int
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_buf_t	*bp;		/* buffer for the summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
+
+	/*
+	 * Compute entry number in the summary file.
+	 */
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (rbpp && *rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
+		/*
+		 * If there was an old one, get rid of it first.
+		 */
+		if (rbpp && *rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
+		}
+		/*
+		 * Remember this buffer and block for the next call.
+		 */
+		if (rbpp) {
+			*rbpp = bp;
+			*rsb = sb;
+		}
+	}
+	/*
+	 * Point to the summary information, modify and log it.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	*sp += delta;
+	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+	return 0;
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to modify */
+	xfs_extlen_t	len,		/* length of extent to modify */
+	int		val)		/* 1 for free, 0 for allocated */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtword_t	*first;		/* first used word in the buffer */
+	int		i;		/* current bit number rel. to start */
+	int		lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask o frelevant bits for value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block, and point to its data.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	first = b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not changed and mask of relevant bits.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		i = lastbit - bit;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Set the word value correctly.
+		 */
+		*b = val;
+		i += XFS_NBWORD;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Compute a mask of relevant bits.
+		 */
+		bit = 0;
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		b++;
+	}
+	/*
+	 * Log any remaining changed bytes.
+	 */
+	if (b > first)
+		xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+			(uint)((char *)b - (char *)bufp - 1));
+	return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to free */
+	xfs_extlen_t	len,		/* length to free */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the freed extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock;	/* first block freed > end */
+	xfs_rtblock_t	preblock;	/* first block freed < start */
+
+	end = start + len - 1;
+	/*
+	 * Modify the bitmap to mark this extent freed.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 1);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Assume we're freeing out of the middle of an allocated extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of allocated extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error)
+		return error;
+	/*
+	 * If there are blocks not being freed at the front of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being freed at the end of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Increment the summary information corresponding to the entire
+	 * (new) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+	return error;
+}
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		val,		/* 1 for free, 0 for allocated */
+	xfs_rtblock_t	*new,		/* out: first block not matching */
+	int		*stat)		/* out: 1 for matches, 0 for not */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = bp->b_addr;
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zero's; 1 (free) => all one's.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not examined.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ val)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = bp->b_addr;
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * Successful, return.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*new = start + i;
+	*stat = 1;
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int				/* error */
+xfs_rtcheck_alloc_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number of extent */
+	xfs_extlen_t	len)		/* length of extent */
+{
+	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
+	int		stat;
+	int		error;
+
+	error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+	if (error)
+		return error;
+	ASSERT(stat);
+	return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l)	(0)
+#endif
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to free */
+	xfs_extlen_t	len)		/* length of extent freed */
+{
+	int		error;		/* error value */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp = NULL;	/* summary file block buffer */
+
+	mp = tp->t_mountp;
+
+	ASSERT(mp->m_rbmip->i_itemp != NULL);
+	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+	error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+	if (error)
+		return error;
+
+	/*
+	 * Free the range of realtime blocks.
+	 */
+	error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Mark more blocks free in the superblock.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+	/*
+	 * If we've now freed all the blocks, reset the file sequence
+	 * number to 0.
+	 */
+	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+	    mp->m_sb.sb_rextents) {
+		if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+			mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+	}
+	return 0;
+}
+
-- 
cgit v1.3


From ceaec15d49f29de1f8864e31ff4007c34b031bff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 17 Sep 2013 08:07:11 -0400
Subject: 9p: make v9fs_cache_inode_{get,put,set}_cookie empty inlines for
 !9P_CACHEFS

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/cache.h          | 12 ++++++++++++
 fs/9p/vfs_file.c       |  2 --
 fs/9p/vfs_inode.c      |  6 ------
 fs/9p/vfs_inode_dotl.c |  4 ----
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 40cc54ced5d9..2f9675491095 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -101,6 +101,18 @@ static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
 
 #else /* CONFIG_9P_FSCACHE */
 
+static inline void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+}
+
+static inline void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+}
+
+static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *file)
+{
+}
+
 static inline int v9fs_fscache_release_page(struct page *page,
 					    gfp_t gfp) {
 	return 1;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index aa5ecf479a57..a0df3e73c2b1 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -105,10 +105,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		v9inode->writeback_fid = (void *) fid;
 	}
 	mutex_unlock(&v9inode->v_mutex);
-#ifdef CONFIG_9P_FSCACHE
 	if (v9ses->cache)
 		v9fs_cache_inode_set_cookie(inode, file);
-#endif
 	return 0;
 out_error:
 	p9_client_clunk(file->private_data);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 94de6d1482e2..4e65aa903345 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -448,9 +448,7 @@ void v9fs_evict_inode(struct inode *inode)
 	clear_inode(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
-#ifdef CONFIG_9P_FSCACHE
 	v9fs_cache_inode_put_cookie(inode);
-#endif
 	/* clunk the fid stashed in writeback_fid */
 	if (v9inode->writeback_fid) {
 		p9_client_clunk(v9inode->writeback_fid);
@@ -531,9 +529,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 		goto error;
 
 	v9fs_stat2inode(st, inode, sb);
-#ifdef CONFIG_9P_FSCACHE
 	v9fs_cache_inode_get_cookie(inode);
-#endif
 	unlock_new_inode(inode);
 	return inode;
 error:
@@ -905,10 +901,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		goto error;
 
 	file->private_data = fid;
-#ifdef CONFIG_9P_FSCACHE
 	if (v9ses->cache)
 		v9fs_cache_inode_set_cookie(dentry->d_inode, file);
-#endif
 
 	*opened |= FILE_CREATED;
 out:
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a7c481402c46..4c10edec26a0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -141,9 +141,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 		goto error;
 
 	v9fs_stat2inode_dotl(st, inode);
-#ifdef CONFIG_9P_FSCACHE
 	v9fs_cache_inode_get_cookie(inode);
-#endif
 	retval = v9fs_get_acl(inode, fid);
 	if (retval)
 		goto error;
@@ -355,10 +353,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	if (err)
 		goto err_clunk_old_fid;
 	file->private_data = ofid;
-#ifdef CONFIG_9P_FSCACHE
 	if (v9ses->cache)
 		v9fs_cache_inode_set_cookie(inode, file);
-#endif
 	*opened |= FILE_CREATED;
 out:
 	v9fs_put_acl(dacl, pacl);
-- 
cgit v1.3


From 3a93e17cf68b01fa29c7a2e861ce508dcd3401ca Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 15 Sep 2013 19:41:16 -0400
Subject: ecryptfs: check DCACHE_OP_REVALIDATE instead of ->d_op

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/dentry.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index bf12ba5dd223..46e31c92e574 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -51,7 +51,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 		return -ECHILD;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
+	if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
 		goto out;
 	rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
 	if (dentry->d_inode) {
-- 
cgit v1.3


From 2edbfbf1c1ab0aeb58ce8abfb69b6f6b63542848 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 15 Sep 2013 20:45:11 -0400
Subject: ecryptfs: don't leave RCU pathwalk immediately

If the underlying dentry doesn't have ->d_revalidate(), there's no need to
force dropping out of RCU mode.  All we need for that is to make freeing
ecryptfs_dentry_info RCU-delayed.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/dentry.c          | 30 ++++++++++++++++--------------
 fs/ecryptfs/ecryptfs_kernel.h |  5 ++++-
 2 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 46e31c92e574..8cefa8e13bcc 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,15 +44,15 @@
  */
 static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	struct dentry *lower_dentry;
-	int rc = 1;
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	int rc;
+
+	if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
+		return 1;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
-	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
-		goto out;
 	rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
 	if (dentry->d_inode) {
 		struct inode *lower_inode =
@@ -60,12 +60,17 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 
 		fsstack_copy_attr_all(dentry->d_inode, lower_inode);
 	}
-out:
 	return rc;
 }
 
 struct kmem_cache *ecryptfs_dentry_info_cache;
 
+static void ecryptfs_dentry_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(ecryptfs_dentry_info_cache,
+		container_of(head, struct ecryptfs_dentry_info, rcu));
+}
+
 /**
  * ecryptfs_d_release
  * @dentry: The ecryptfs dentry
@@ -74,15 +79,12 @@ struct kmem_cache *ecryptfs_dentry_info_cache;
  */
 static void ecryptfs_d_release(struct dentry *dentry)
 {
-	if (ecryptfs_dentry_to_private(dentry)) {
-		if (ecryptfs_dentry_to_lower(dentry)) {
-			dput(ecryptfs_dentry_to_lower(dentry));
-			mntput(ecryptfs_dentry_to_lower_mnt(dentry));
-		}
-		kmem_cache_free(ecryptfs_dentry_info_cache,
-				ecryptfs_dentry_to_private(dentry));
+	struct ecryptfs_dentry_info *p = dentry->d_fsdata;
+	if (p) {
+		if (p->lower_path.dentry)
+			path_put(&p->lower_path);
+		call_rcu(&p->rcu, ecryptfs_dentry_free_rcu);
 	}
-	return;
 }
 
 const struct dentry_operations ecryptfs_dops = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index df19d34a033b..e7ad2b3f24a7 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -261,7 +261,10 @@ struct ecryptfs_inode_info {
  * vfsmount too. */
 struct ecryptfs_dentry_info {
 	struct path lower_path;
-	struct ecryptfs_crypt_stat *crypt_stat;
+	union {
+		struct ecryptfs_crypt_stat *crypt_stat;
+		struct rcu_head rcu;
+	};
 };
 
 /**
-- 
cgit v1.3


From 92dd123033d50051d1352fada1bad91c22a119cf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 15 Sep 2013 20:50:13 -0400
Subject: ecryptfs: get rid of ecryptfs_set_dentry_lower{,_mnt}

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/ecryptfs_kernel.h | 14 --------------
 fs/ecryptfs/inode.c           |  4 ++--
 fs/ecryptfs/main.c            |  3 +--
 3 files changed, 3 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e7ad2b3f24a7..90d1882b306f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -515,13 +515,6 @@ ecryptfs_dentry_to_lower(struct dentry *dentry)
 	return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
 }
 
-static inline void
-ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry)
-{
-	((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry =
-		lower_dentry;
-}
-
 static inline struct vfsmount *
 ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
 {
@@ -534,13 +527,6 @@ ecryptfs_dentry_to_lower_path(struct dentry *dentry)
 	return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
 }
 
-static inline void
-ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
-{
-	((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt =
-		lower_mnt;
-}
-
 #define ecryptfs_printk(type, fmt, arg...) \
         __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
 __printf(1, 2)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 67e9b6339691..2ca62c40c6c2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -361,8 +361,8 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
 	BUG_ON(!d_count(lower_dentry));
 
 	ecryptfs_set_dentry_private(dentry, dentry_info);
-	ecryptfs_set_dentry_lower(dentry, lower_dentry);
-	ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+	dentry_info->lower_path.mnt = lower_mnt;
+	dentry_info->lower_path.dentry = lower_dentry;
 
 	if (!lower_dentry->d_inode) {
 		/* We want to add because we couldn't find in lower */
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index eb1c5979ecaf..1b119d3bf924 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -585,8 +585,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 
 	/* ->kill_sb() will take care of root_info */
 	ecryptfs_set_dentry_private(s->s_root, root_info);
-	ecryptfs_set_dentry_lower(s->s_root, path.dentry);
-	ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
+	root_info->lower_path = path;
 
 	s->s_flags |= MS_ACTIVE;
 	return dget(s->s_root);
-- 
cgit v1.3


From cbe9c08524a68a1bfc1ba8d84dabb985d18ff08d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 15 Sep 2013 20:54:18 -0400
Subject: ecryptfs: ->lower_path.dentry is never NULL

... on anything found via ->d_fsdata

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/dentry.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8cefa8e13bcc..4000f6b3a750 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -81,8 +81,7 @@ static void ecryptfs_d_release(struct dentry *dentry)
 {
 	struct ecryptfs_dentry_info *p = dentry->d_fsdata;
 	if (p) {
-		if (p->lower_path.dentry)
-			path_put(&p->lower_path);
+		path_put(&p->lower_path);
 		call_rcu(&p->rcu, ecryptfs_dentry_free_rcu);
 	}
 }
-- 
cgit v1.3


From 12f38872224542c4683ab72a88394fc89b87ca32 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 15 Sep 2013 21:20:49 -0400
Subject: libfs: get exports to definitions of objects being exported...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c | 69 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index 3a3a9b53bf5a..828622a31d30 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -31,6 +31,7 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
 	return 0;
 }
+EXPORT_SYMBOL(simple_getattr);
 
 int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
@@ -39,6 +40,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
+EXPORT_SYMBOL(simple_statfs);
 
 /*
  * Retaining negative dentries for an in-memory filesystem just wastes
@@ -66,6 +68,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
 	d_add(dentry, NULL);
 	return NULL;
 }
+EXPORT_SYMBOL(simple_lookup);
 
 int dcache_dir_open(struct inode *inode, struct file *file)
 {
@@ -75,12 +78,14 @@ int dcache_dir_open(struct inode *inode, struct file *file)
 
 	return file->private_data ? 0 : -ENOMEM;
 }
+EXPORT_SYMBOL(dcache_dir_open);
 
 int dcache_dir_close(struct inode *inode, struct file *file)
 {
 	dput(file->private_data);
 	return 0;
 }
+EXPORT_SYMBOL(dcache_dir_close);
 
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 {
@@ -123,6 +128,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 	mutex_unlock(&dentry->d_inode->i_mutex);
 	return offset;
 }
+EXPORT_SYMBOL(dcache_dir_lseek);
 
 /* Relationship between i_mode and the DT_xxx types */
 static inline unsigned char dt_type(struct inode *inode)
@@ -172,11 +178,13 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 	spin_unlock(&dentry->d_lock);
 	return 0;
 }
+EXPORT_SYMBOL(dcache_readdir);
 
 ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
 {
 	return -EISDIR;
 }
+EXPORT_SYMBOL(generic_read_dir);
 
 const struct file_operations simple_dir_operations = {
 	.open		= dcache_dir_open,
@@ -186,10 +194,12 @@ const struct file_operations simple_dir_operations = {
 	.iterate	= dcache_readdir,
 	.fsync		= noop_fsync,
 };
+EXPORT_SYMBOL(simple_dir_operations);
 
 const struct inode_operations simple_dir_inode_operations = {
 	.lookup		= simple_lookup,
 };
+EXPORT_SYMBOL(simple_dir_inode_operations);
 
 static const struct super_operations simple_super_operations = {
 	.statfs		= simple_statfs,
@@ -244,6 +254,7 @@ Enomem:
 	deactivate_locked_super(s);
 	return ERR_PTR(-ENOMEM);
 }
+EXPORT_SYMBOL(mount_pseudo);
 
 int simple_open(struct inode *inode, struct file *file)
 {
@@ -251,6 +262,7 @@ int simple_open(struct inode *inode, struct file *file)
 		file->private_data = inode->i_private;
 	return 0;
 }
+EXPORT_SYMBOL(simple_open);
 
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
@@ -263,6 +275,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 	d_instantiate(dentry, inode);
 	return 0;
 }
+EXPORT_SYMBOL(simple_link);
 
 int simple_empty(struct dentry *dentry)
 {
@@ -283,6 +296,7 @@ out:
 	spin_unlock(&dentry->d_lock);
 	return ret;
 }
+EXPORT_SYMBOL(simple_empty);
 
 int simple_unlink(struct inode *dir, struct dentry *dentry)
 {
@@ -293,6 +307,7 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
 	dput(dentry);
 	return 0;
 }
+EXPORT_SYMBOL(simple_unlink);
 
 int simple_rmdir(struct inode *dir, struct dentry *dentry)
 {
@@ -304,6 +319,7 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
 	drop_nlink(dir);
 	return 0;
 }
+EXPORT_SYMBOL(simple_rmdir);
 
 int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
@@ -330,6 +346,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	return 0;
 }
+EXPORT_SYMBOL(simple_rename);
 
 /**
  * simple_setattr - setattr for simple filesystem
@@ -370,6 +387,7 @@ int simple_readpage(struct file *file, struct page *page)
 	unlock_page(page);
 	return 0;
 }
+EXPORT_SYMBOL(simple_readpage);
 
 int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
@@ -393,6 +411,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 	}
 	return 0;
 }
+EXPORT_SYMBOL(simple_write_begin);
 
 /**
  * simple_write_end - .write_end helper for non-block-device FSes
@@ -444,6 +463,7 @@ int simple_write_end(struct file *file, struct address_space *mapping,
 
 	return copied;
 }
+EXPORT_SYMBOL(simple_write_end);
 
 /*
  * the inodes created here are not hashed. If you use iunique to generate
@@ -512,6 +532,7 @@ out:
 	dput(root);
 	return -ENOMEM;
 }
+EXPORT_SYMBOL(simple_fill_super);
 
 static DEFINE_SPINLOCK(pin_fs_lock);
 
@@ -534,6 +555,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c
 	mntput(mnt);
 	return 0;
 }
+EXPORT_SYMBOL(simple_pin_fs);
 
 void simple_release_fs(struct vfsmount **mount, int *count)
 {
@@ -545,6 +567,7 @@ void simple_release_fs(struct vfsmount **mount, int *count)
 	spin_unlock(&pin_fs_lock);
 	mntput(mnt);
 }
+EXPORT_SYMBOL(simple_release_fs);
 
 /**
  * simple_read_from_buffer - copy data from the buffer to user space
@@ -579,6 +602,7 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
 	*ppos = pos + count;
 	return count;
 }
+EXPORT_SYMBOL(simple_read_from_buffer);
 
 /**
  * simple_write_to_buffer - copy data from user space to the buffer
@@ -613,6 +637,7 @@ ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 	*ppos = pos + count;
 	return count;
 }
+EXPORT_SYMBOL(simple_write_to_buffer);
 
 /**
  * memory_read_from_buffer - copy data from the buffer
@@ -644,6 +669,7 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
 
 	return count;
 }
+EXPORT_SYMBOL(memory_read_from_buffer);
 
 /*
  * Transaction based IO.
@@ -665,6 +691,7 @@ void simple_transaction_set(struct file *file, size_t n)
 	smp_mb();
 	ar->size = n;
 }
+EXPORT_SYMBOL(simple_transaction_set);
 
 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
 {
@@ -696,6 +723,7 @@ char *simple_transaction_get(struct file *file, const char __user *buf, size_t s
 
 	return ar->data;
 }
+EXPORT_SYMBOL(simple_transaction_get);
 
 ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
@@ -705,12 +733,14 @@ ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size
 		return 0;
 	return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
 }
+EXPORT_SYMBOL(simple_transaction_read);
 
 int simple_transaction_release(struct inode *inode, struct file *file)
 {
 	free_page((unsigned long)file->private_data);
 	return 0;
 }
+EXPORT_SYMBOL(simple_transaction_release);
 
 /* Simple attribute files */
 
@@ -746,12 +776,14 @@ int simple_attr_open(struct inode *inode, struct file *file,
 
 	return nonseekable_open(inode, file);
 }
+EXPORT_SYMBOL_GPL(simple_attr_open);
 
 int simple_attr_release(struct inode *inode, struct file *file)
 {
 	kfree(file->private_data);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(simple_attr_release);	/* GPL-only?  This?  Really? */
 
 /* read from the buffer that is filled with the get function */
 ssize_t simple_attr_read(struct file *file, char __user *buf,
@@ -787,6 +819,7 @@ out:
 	mutex_unlock(&attr->mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(simple_attr_read);
 
 /* interpret the buffer as a number to call the set function with */
 ssize_t simple_attr_write(struct file *file, const char __user *buf,
@@ -819,6 +852,7 @@ out:
 	mutex_unlock(&attr->mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(simple_attr_write);
 
 /**
  * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
@@ -957,39 +991,4 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	return 0;
 }
-
-EXPORT_SYMBOL(dcache_dir_close);
-EXPORT_SYMBOL(dcache_dir_lseek);
-EXPORT_SYMBOL(dcache_dir_open);
-EXPORT_SYMBOL(dcache_readdir);
-EXPORT_SYMBOL(generic_read_dir);
-EXPORT_SYMBOL(mount_pseudo);
-EXPORT_SYMBOL(simple_write_begin);
-EXPORT_SYMBOL(simple_write_end);
-EXPORT_SYMBOL(simple_dir_inode_operations);
-EXPORT_SYMBOL(simple_dir_operations);
-EXPORT_SYMBOL(simple_empty);
-EXPORT_SYMBOL(simple_fill_super);
-EXPORT_SYMBOL(simple_getattr);
-EXPORT_SYMBOL(simple_open);
-EXPORT_SYMBOL(simple_link);
-EXPORT_SYMBOL(simple_lookup);
-EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_SYMBOL(simple_readpage);
-EXPORT_SYMBOL(simple_release_fs);
-EXPORT_SYMBOL(simple_rename);
-EXPORT_SYMBOL(simple_rmdir);
-EXPORT_SYMBOL(simple_statfs);
 EXPORT_SYMBOL(noop_fsync);
-EXPORT_SYMBOL(simple_unlink);
-EXPORT_SYMBOL(simple_read_from_buffer);
-EXPORT_SYMBOL(simple_write_to_buffer);
-EXPORT_SYMBOL(memory_read_from_buffer);
-EXPORT_SYMBOL(simple_transaction_set);
-EXPORT_SYMBOL(simple_transaction_get);
-EXPORT_SYMBOL(simple_transaction_read);
-EXPORT_SYMBOL(simple_transaction_release);
-EXPORT_SYMBOL_GPL(simple_attr_open);
-EXPORT_SYMBOL_GPL(simple_attr_release);
-EXPORT_SYMBOL_GPL(simple_attr_read);
-EXPORT_SYMBOL_GPL(simple_attr_write);
-- 
cgit v1.3


From 87dc800be2499128efb3a6f059d75dc8e1e6d503 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 10:30:04 -0400
Subject: new helper: kfree_put_link()

duplicated to hell and back...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cifs/cifsfs.c    |  2 +-
 fs/cifs/cifsfs.h    |  2 --
 fs/cifs/link.c      |  7 -------
 fs/ecryptfs/inode.c | 12 +-----------
 fs/gfs2/inode.c     |  9 +--------
 fs/libfs.c          | 10 ++++++++++
 fs/proc/self.c      | 10 +---------
 include/linux/fs.h  |  1 +
 8 files changed, 15 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a16b4e58bcc6..801975c34cf9 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -860,7 +860,7 @@ const struct inode_operations cifs_file_inode_ops = {
 const struct inode_operations cifs_symlink_inode_ops = {
 	.readlink = generic_readlink,
 	.follow_link = cifs_follow_link,
-	.put_link = cifs_put_link,
+	.put_link = kfree_put_link,
 	.permission = cifs_permission,
 	/* BB add the following two eventually */
 	/* revalidate: cifs_revalidate,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6d0b07217ac9..26a754f49ba1 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -115,8 +115,6 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
 
 /* Functions related to symlinks */
 extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
-extern void cifs_put_link(struct dentry *direntry,
-			  struct nameidata *nd, void *);
 extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
 			 int buflen);
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 7e36ceba0c7a..cc0234710ddb 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -621,10 +621,3 @@ symlink_exit:
 	free_xid(xid);
 	return rc;
 }
-
-void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie)
-{
-	char *p = nd_get_link(nd);
-	if (!IS_ERR(p))
-		kfree(p);
-}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 2ca62c40c6c2..0f9b66eaa767 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -703,16 +703,6 @@ out:
 	return NULL;
 }
 
-static void
-ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
-{
-	char *buf = nd_get_link(nd);
-	if (!IS_ERR(buf)) {
-		/* Free the char* */
-		kfree(buf);
-	}
-}
-
 /**
  * upper_size_to_lower_size
  * @crypt_stat: Crypt_stat associated with file
@@ -1121,7 +1111,7 @@ out:
 const struct inode_operations ecryptfs_symlink_iops = {
 	.readlink = generic_readlink,
 	.follow_link = ecryptfs_follow_link,
-	.put_link = ecryptfs_put_link,
+	.put_link = kfree_put_link,
 	.permission = ecryptfs_permission,
 	.setattr = ecryptfs_setattr,
 	.getattr = ecryptfs_getattr_link,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index ced3257f06e8..d2384f7c53e1 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1506,13 +1506,6 @@ out:
 	return NULL;
 }
 
-static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		kfree(s);
-}
-
 /**
  * gfs2_permission -
  * @inode: The inode
@@ -1864,7 +1857,7 @@ const struct inode_operations gfs2_dir_iops = {
 const struct inode_operations gfs2_symlink_iops = {
 	.readlink = generic_readlink,
 	.follow_link = gfs2_follow_link,
-	.put_link = gfs2_put_link,
+	.put_link = kfree_put_link,
 	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
diff --git a/fs/libfs.c b/fs/libfs.c
index 828622a31d30..8c5018493154 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -10,6 +10,7 @@
 #include <linux/vfs.h>
 #include <linux/quotaops.h>
 #include <linux/mutex.h>
+#include <linux/namei.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> /* sync_mapping_buffers */
@@ -992,3 +993,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	return 0;
 }
 EXPORT_SYMBOL(noop_fsync);
+
+void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
+}
+EXPORT_SYMBOL(kfree_put_link);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 6b6a993b5c25..ffeb202ec942 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -36,18 +36,10 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
-				void *cookie)
-{
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		kfree(s);
-}
-
 static const struct inode_operations proc_self_inode_operations = {
 	.readlink	= proc_self_readlink,
 	.follow_link	= proc_self_follow_link,
-	.put_link	= proc_self_put_link,
+	.put_link	= kfree_put_link,
 };
 
 static unsigned self_inum;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f40547ba191..d80c2437d624 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2502,6 +2502,7 @@ extern int __page_symlink(struct inode *inode, const char *symname, int len,
 		int nofs);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
+extern void kfree_put_link(struct dentry *, struct nameidata *, void *);
 extern int generic_readlink(struct dentry *, char __user *, int);
 extern void generic_fillattr(struct inode *, struct kstat *);
 extern int vfs_getattr(struct path *, struct kstat *);
-- 
cgit v1.3


From 48bc06e74be178968b53d339dbcb110cd2bb16ea Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 10:35:31 -0400
Subject: befs: split symlink iops in two - for short and long symlinks resp.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/befs/linuxvfs.c | 61 +++++++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e9c75e20db32..daa15d6ba450 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,7 +42,7 @@ static void befs_destroy_inode(struct inode *inode);
 static int befs_init_inodecache(void);
 static void befs_destroy_inodecache(void);
 static void *befs_follow_link(struct dentry *, struct nameidata *);
-static void befs_put_link(struct dentry *, struct nameidata *, void *);
+static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
 static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -79,10 +79,15 @@ static const struct address_space_operations befs_aops = {
 	.bmap		= befs_bmap,
 };
 
+static const struct inode_operations befs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= befs_fast_follow_link,
+};
+
 static const struct inode_operations befs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= befs_follow_link,
-	.put_link	= befs_put_link,
+	.put_link	= kfree_put_link,
 };
 
 /* 
@@ -411,7 +416,10 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &befs_dir_inode_operations;
 		inode->i_fop = &befs_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
-		inode->i_op = &befs_symlink_inode_operations;
+		if (befs_ino->i_flags & BEFS_LONG_SYMLINK)
+			inode->i_op = &befs_symlink_inode_operations;
+		else
+			inode->i_op = &befs_fast_symlink_inode_operations;
 	} else {
 		befs_error(sb, "Inode %lu is not a regular file, "
 			   "directory or symlink. THAT IS WRONG! BeFS has no "
@@ -477,47 +485,40 @@ befs_destroy_inodecache(void)
 static void *
 befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
+	struct super_block *sb = dentry->d_sb;
 	befs_inode_info *befs_ino = BEFS_I(dentry->d_inode);
+	befs_data_stream *data = &befs_ino->i_data.ds;
+	befs_off_t len = data->size;
 	char *link;
 
-	if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
-		struct super_block *sb = dentry->d_sb;
-		befs_data_stream *data = &befs_ino->i_data.ds;
-		befs_off_t len = data->size;
+	if (len == 0) {
+		befs_error(sb, "Long symlink with illegal length");
+		link = ERR_PTR(-EIO);
+	} else {
+		befs_debug(sb, "Follow long symlink");
 
-		if (len == 0) {
-			befs_error(sb, "Long symlink with illegal length");
+		link = kmalloc(len, GFP_NOFS);
+		if (!link) {
+			link = ERR_PTR(-ENOMEM);
+		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
+			kfree(link);
+			befs_error(sb, "Failed to read entire long symlink");
 			link = ERR_PTR(-EIO);
 		} else {
-			befs_debug(sb, "Follow long symlink");
-
-			link = kmalloc(len, GFP_NOFS);
-			if (!link) {
-				link = ERR_PTR(-ENOMEM);
-			} else if (befs_read_lsymlink(sb, data, link, len) != len) {
-				kfree(link);
-				befs_error(sb, "Failed to read entire long symlink");
-				link = ERR_PTR(-EIO);
-			} else {
-				link[len - 1] = '\0';
-			}
+			link[len - 1] = '\0';
 		}
-	} else {
-		link = befs_ino->i_data.symlink;
 	}
-
 	nd_set_link(nd, link);
 	return NULL;
 }
 
-static void befs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+
+static void *
+befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	befs_inode_info *befs_ino = BEFS_I(dentry->d_inode);
-	if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
-		char *link = nd_get_link(nd);
-		if (!IS_ERR(link))
-			kfree(link);
-	}
+	nd_set_link(nd, befs_ino->i_data.symlink);
+	return NULL;
 }
 
 /*
-- 
cgit v1.3


From 6de1472f1a4a3bd912f515f29d3cf52a65a4c718 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 10:53:17 -0400
Subject: nfs: use %p[dD] instead of open-coded (and often racy) equivalents

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c           | 119 ++++++++++++++++++++-----------------------------
 fs/nfs/direct.c        |  17 +++----
 fs/nfs/file.c          | 117 ++++++++++++++++++------------------------------
 fs/nfs/namespace.c     |   5 +--
 fs/nfs/nfs3proc.c      |   8 ++--
 fs/nfs/nfs4file.c      |   4 +-
 fs/nfs/nfs4namespace.c |   7 ++-
 fs/nfs/nfs4proc.c      |   5 +--
 fs/nfs/proc.c          |   8 ++--
 fs/nfs/unlink.c        |   9 ++--
 fs/nfs/write.c         |   6 +--
 11 files changed, 119 insertions(+), 186 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 02b0df769e2d..8b3dd7174fa2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -98,9 +98,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
 	struct nfs_open_dir_context *ctx;
 	struct rpc_cred *cred;
 
-	dfprintk(FILE, "NFS: open dir(%s/%s)\n",
-			filp->f_path.dentry->d_parent->d_name.name,
-			filp->f_path.dentry->d_name.name);
+	dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
 
@@ -297,11 +295,10 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 				if (ctx->duped > 0
 				    && ctx->dup_cookie == *desc->dir_cookie) {
 					if (printk_ratelimit()) {
-						pr_notice("NFS: directory %s/%s contains a readdir loop."
+						pr_notice("NFS: directory %pD2 contains a readdir loop."
 								"Please contact your server vendor.  "
 								"The file: %s has duplicate cookie %llu\n",
-								desc->file->f_dentry->d_parent->d_name.name,
-								desc->file->f_dentry->d_name.name,
+								desc->file,
 								array->array[i].string.name,
 								*desc->dir_cookie);
 					}
@@ -822,9 +819,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	struct nfs_open_dir_context *dir_ctx = file->private_data;
 	int res = 0;
 
-	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
-			dentry->d_parent->d_name.name, dentry->d_name.name,
-			(long long)ctx->pos);
+	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
+			file, (long long)ctx->pos);
 	nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
 
 	/*
@@ -880,22 +876,17 @@ out:
 	nfs_unblock_sillyrename(dentry);
 	if (res > 0)
 		res = 0;
-	dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
-			dentry->d_parent->d_name.name, dentry->d_name.name,
-			res);
+	dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
 	return res;
 }
 
 static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct nfs_open_dir_context *dir_ctx = filp->private_data;
 
-	dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
-			dentry->d_parent->d_name.name,
-			dentry->d_name.name,
-			offset, whence);
+	dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
+			filp, offset, whence);
 
 	mutex_lock(&inode->i_mutex);
 	switch (whence) {
@@ -925,15 +916,12 @@ out:
 static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
 			 int datasync)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
-	dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
-			dentry->d_parent->d_name.name, dentry->d_name.name,
-			datasync);
+	dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
 
 	mutex_lock(&inode->i_mutex);
-	nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
+	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	mutex_unlock(&inode->i_mutex);
 	return 0;
 }
@@ -1073,9 +1061,8 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	}
 
 	if (is_bad_inode(inode)) {
-		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
-				__func__, dentry->d_parent->d_name.name,
-				dentry->d_name.name);
+		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+				__func__, dentry);
 		goto out_bad;
 	}
 
@@ -1125,9 +1112,8 @@ out_set_verifier:
 	nfs_advise_use_readdirplus(dir);
  out_valid_noent:
 	dput(parent);
-	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
-			__func__, dentry->d_parent->d_name.name,
-			dentry->d_name.name);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+			__func__, dentry);
 	return 1;
 out_zap_parent:
 	nfs_zap_caches(dir);
@@ -1147,18 +1133,16 @@ out_zap_parent:
 		goto out_valid;
 
 	dput(parent);
-	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
-			__func__, dentry->d_parent->d_name.name,
-			dentry->d_name.name);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+			__func__, dentry);
 	return 0;
 out_error:
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
 	nfs4_label_free(label);
 	dput(parent);
-	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
-			__func__, dentry->d_parent->d_name.name,
-			dentry->d_name.name, error);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+			__func__, dentry, error);
 	return error;
 }
 
@@ -1182,16 +1166,14 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
 	 * eventually need to do something more here.
 	 */
 	if (!inode) {
-		dfprintk(LOOKUPCACHE, "%s: %s/%s has negative inode\n",
-				__func__, dentry->d_parent->d_name.name,
-				dentry->d_name.name);
+		dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n",
+				__func__, dentry);
 		return 1;
 	}
 
 	if (is_bad_inode(inode)) {
-		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
-				__func__, dentry->d_parent->d_name.name,
-				dentry->d_name.name);
+		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+				__func__, dentry);
 		return 0;
 	}
 
@@ -1206,9 +1188,8 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
  */
 static int nfs_dentry_delete(const struct dentry *dentry)
 {
-	dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
-		dentry->d_flags);
+	dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n",
+		dentry, dentry->d_flags);
 
 	/* Unhash any dentry with a stale inode */
 	if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
@@ -1286,8 +1267,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
 	struct nfs4_label *label = NULL;
 	int error;
 
-	dfprintk(VFS, "NFS: lookup(%s/%s)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
 	nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
 
 	res = ERR_PTR(-ENAMETOOLONG);
@@ -1418,8 +1398,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	/* Expect a negative dentry */
 	BUG_ON(dentry->d_inode);
 
-	dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n",
-			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+	dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
 
 	err = nfs_check_flags(open_flags);
 	if (err)
@@ -1608,8 +1588,8 @@ int nfs_create(struct inode *dir, struct dentry *dentry,
 	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
 	int error;
 
-	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
-			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+	dfprintk(VFS, "NFS: create(%s/%ld), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
@@ -1635,8 +1615,8 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 	struct iattr attr;
 	int status;
 
-	dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n",
-			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+	dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -1664,8 +1644,8 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct iattr attr;
 	int error;
 
-	dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n",
-			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+	dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_valid = ATTR_MODE;
 	attr.ia_mode = mode | S_IFDIR;
@@ -1692,8 +1672,8 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 
-	dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
-			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+	dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
 
 	trace_nfs_rmdir_enter(dir, dentry);
 	if (dentry->d_inode) {
@@ -1728,8 +1708,7 @@ static int nfs_safe_remove(struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int error = -EBUSY;
 		
-	dfprintk(VFS, "NFS: safe_remove(%s/%s)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry);
 
 	/* If the dentry was sillyrenamed, we simply call d_delete() */
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
@@ -1762,8 +1741,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
 	int error;
 	int need_rehash = 0;
 
-	dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
-		dir->i_ino, dentry->d_name.name);
+	dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id,
+		dir->i_ino, dentry);
 
 	trace_nfs_unlink_enter(dir, dentry);
 	spin_lock(&dentry->d_lock);
@@ -1813,8 +1792,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	unsigned int pathlen = strlen(symname);
 	int error;
 
-	dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
-		dir->i_ino, dentry->d_name.name, symname);
+	dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id,
+		dir->i_ino, dentry, symname);
 
 	if (pathlen > PAGE_SIZE)
 		return -ENAMETOOLONG;
@@ -1836,9 +1815,9 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
 	trace_nfs_symlink_exit(dir, dentry, error);
 	if (error != 0) {
-		dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
+		dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n",
 			dir->i_sb->s_id, dir->i_ino,
-			dentry->d_name.name, symname, error);
+			dentry, symname, error);
 		d_drop(dentry);
 		__free_page(page);
 		return error;
@@ -1865,9 +1844,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 	struct inode *inode = old_dentry->d_inode;
 	int error;
 
-	dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n",
-		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n",
+		old_dentry, dentry);
 
 	trace_nfs_link_enter(inode, dir, dentry);
 	NFS_PROTO(inode)->return_delegation(inode);
@@ -1915,9 +1893,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *dentry = NULL, *rehash = NULL;
 	int error = -EBUSY;
 
-	dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
-		 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
-		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
+	dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
+		 old_dentry, new_dentry,
 		 d_count(new_dentry));
 
 	trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 91ff089d3412..d71d66c9e0a1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -124,9 +124,8 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
 {
 #ifndef CONFIG_NFS_SWAP
-	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
-			iocb->ki_filp->f_path.dentry->d_name.name,
-			(long long) pos, nr_segs);
+	dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
+			iocb->ki_filp, (long long) pos, nr_segs);
 
 	return -EINVAL;
 #else
@@ -909,10 +908,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	count = iov_length(iov, nr_segs);
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 
-	dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
-		file->f_path.dentry->d_parent->d_name.name,
-		file->f_path.dentry->d_name.name,
-		count, (long long) pos);
+	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+		file, count, (long long) pos);
 
 	retval = 0;
 	if (!count)
@@ -965,10 +962,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	count = iov_length(iov, nr_segs);
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 
-	dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
-		file->f_path.dentry->d_parent->d_name.name,
-		file->f_path.dentry->d_name.name,
-		count, (long long) pos);
+	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
+		file, count, (long long) pos);
 
 	retval = generic_write_checks(file, &pos, &count, 0);
 	if (retval)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1e6bfdbc1aff..e2fcacf07de3 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -65,9 +65,7 @@ nfs_file_open(struct inode *inode, struct file *filp)
 {
 	int res;
 
-	dprintk("NFS: open file(%s/%s)\n",
-			filp->f_path.dentry->d_parent->d_name.name,
-			filp->f_path.dentry->d_name.name);
+	dprintk("NFS: open file(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
 	res = nfs_check_flags(filp->f_flags);
@@ -81,9 +79,7 @@ nfs_file_open(struct inode *inode, struct file *filp)
 int
 nfs_file_release(struct inode *inode, struct file *filp)
 {
-	dprintk("NFS: release(%s/%s)\n",
-			filp->f_path.dentry->d_parent->d_name.name,
-			filp->f_path.dentry->d_name.name);
+	dprintk("NFS: release(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
 	return nfs_release(inode, filp);
@@ -123,10 +119,8 @@ force_reval:
 
 loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
 {
-	dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
-			filp->f_path.dentry->d_parent->d_name.name,
-			filp->f_path.dentry->d_name.name,
-			offset, whence);
+	dprintk("NFS: llseek file(%pD2, %lld, %d)\n",
+			filp, offset, whence);
 
 	/*
 	 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
@@ -150,12 +144,9 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
 int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
-	struct dentry	*dentry = file->f_path.dentry;
-	struct inode	*inode = dentry->d_inode;
+	struct inode	*inode = file_inode(file);
 
-	dprintk("NFS: flush(%s/%s)\n",
-			dentry->d_parent->d_name.name,
-			dentry->d_name.name);
+	dprintk("NFS: flush(%pD2)\n", file);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 	if ((file->f_mode & FMODE_WRITE) == 0)
@@ -177,15 +168,14 @@ ssize_t
 nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
-	struct dentry * dentry = iocb->ki_filp->f_path.dentry;
-	struct inode * inode = dentry->d_inode;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t result;
 
 	if (iocb->ki_filp->f_flags & O_DIRECT)
 		return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
 
-	dprintk("NFS: read(%s/%s, %lu@%lu)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
+	dprintk("NFS: read(%pD2, %lu@%lu)\n",
+		iocb->ki_filp,
 		(unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
 
 	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
@@ -203,13 +193,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
 		     struct pipe_inode_info *pipe, size_t count,
 		     unsigned int flags)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	ssize_t res;
 
-	dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
-		(unsigned long) count, (unsigned long long) *ppos);
+	dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
+		filp, (unsigned long) count, (unsigned long long) *ppos);
 
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (!res) {
@@ -224,12 +212,10 @@ EXPORT_SYMBOL_GPL(nfs_file_splice_read);
 int
 nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	int	status;
 
-	dprintk("NFS: mmap(%s/%s)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	dprintk("NFS: mmap(%pD2)\n", file);
 
 	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
 	 *       so we call that before revalidating the mapping
@@ -258,15 +244,12 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
 int
 nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	int have_error, do_resend, status;
 	int ret = 0;
 
-	dprintk("NFS: fsync file(%s/%s) datasync %d\n",
-			dentry->d_parent->d_name.name, dentry->d_name.name,
-			datasync);
+	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
@@ -371,10 +354,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int once_thru = 0;
 
-	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
-		file->f_path.dentry->d_parent->d_name.name,
-		file->f_path.dentry->d_name.name,
-		mapping->host->i_ino, len, (long long) pos);
+	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n",
+		file, mapping->host->i_ino, len, (long long) pos);
 
 start:
 	/*
@@ -414,10 +395,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	int status;
 
-	dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
-		file->f_path.dentry->d_parent->d_name.name,
-		file->f_path.dentry->d_name.name,
-		mapping->host->i_ino, len, (long long) pos);
+	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n",
+		file, mapping->host->i_ino, len, (long long) pos);
 
 	/*
 	 * Zero any uninitialised parts of the page, and then mark the page
@@ -601,22 +580,21 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
-	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = file_inode(filp);
 	unsigned pagelen;
 	int ret = VM_FAULT_NOPAGE;
 	struct address_space *mapping;
 
-	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
-		filp->f_mapping->host->i_ino,
+	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n",
+		filp, filp->f_mapping->host->i_ino,
 		(long long)page_offset(page));
 
 	/* make sure the cache has finished storing the page */
-	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+	nfs_fscache_wait_on_page_write(NFS_I(inode), page);
 
 	lock_page(page);
 	mapping = page_file_mapping(page);
-	if (mapping != dentry->d_inode->i_mapping)
+	if (mapping != inode->i_mapping)
 		goto out_unlock;
 
 	wait_on_page_writeback(page);
@@ -659,22 +637,21 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
 ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 		       unsigned long nr_segs, loff_t pos)
 {
-	struct dentry * dentry = iocb->ki_filp->f_path.dentry;
-	struct inode * inode = dentry->d_inode;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
 	unsigned long written = 0;
 	ssize_t result;
 	size_t count = iov_length(iov, nr_segs);
 
-	result = nfs_key_timeout_notify(iocb->ki_filp, inode);
+	result = nfs_key_timeout_notify(file, inode);
 	if (result)
 		return result;
 
-	if (iocb->ki_filp->f_flags & O_DIRECT)
+	if (file->f_flags & O_DIRECT)
 		return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
 
-	dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
-		(unsigned long) count, (long long) pos);
+	dprintk("NFS: write(%pD2, %lu@%Ld)\n",
+		file, (unsigned long) count, (long long) pos);
 
 	result = -EBUSY;
 	if (IS_SWAPFILE(inode))
@@ -682,8 +659,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 	/*
 	 * O_APPEND implies that we must revalidate the file length.
 	 */
-	if (iocb->ki_filp->f_flags & O_APPEND) {
-		result = nfs_revalidate_file_size(inode, iocb->ki_filp);
+	if (file->f_flags & O_APPEND) {
+		result = nfs_revalidate_file_size(inode, file);
 		if (result)
 			goto out;
 	}
@@ -697,8 +674,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 		written = result;
 
 	/* Return error values for O_DSYNC and IS_SYNC() */
-	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
-		int err = vfs_fsync(iocb->ki_filp, 0);
+	if (result >= 0 && nfs_need_sync_write(file, inode)) {
+		int err = vfs_fsync(file, 0);
 		if (err < 0)
 			result = err;
 	}
@@ -717,14 +694,12 @@ ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 			      struct file *filp, loff_t *ppos,
 			      size_t count, unsigned int flags)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	unsigned long written = 0;
 	ssize_t ret;
 
-	dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
-		(unsigned long) count, (unsigned long long) *ppos);
+	dprintk("NFS splice_write(%pD2, %lu@%llu)\n",
+		filp, (unsigned long) count, (unsigned long long) *ppos);
 
 	/*
 	 * The combination of splice and an O_APPEND destination is disallowed.
@@ -883,10 +858,8 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 	int ret = -ENOLCK;
 	int is_local = 0;
 
-	dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
-			filp->f_path.dentry->d_parent->d_name.name,
-			filp->f_path.dentry->d_name.name,
-			fl->fl_type, fl->fl_flags,
+	dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n",
+			filp, fl->fl_type, fl->fl_flags,
 			(long long)fl->fl_start, (long long)fl->fl_end);
 
 	nfs_inc_stats(inode, NFSIOS_VFSLOCK);
@@ -923,10 +896,8 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 	struct inode *inode = filp->f_mapping->host;
 	int is_local = 0;
 
-	dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
-			filp->f_path.dentry->d_parent->d_name.name,
-			filp->f_path.dentry->d_name.name,
-			fl->fl_type, fl->fl_flags);
+	dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n",
+			filp, fl->fl_type, fl->fl_flags);
 
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
@@ -960,9 +931,7 @@ EXPORT_SYMBOL_GPL(nfs_flock);
  */
 int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
 {
-	dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
-			file->f_path.dentry->d_parent->d_name.name,
-			file->f_path.dentry->d_name.name, arg);
+	dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);
 	return -EINVAL;
 }
 EXPORT_SYMBOL_GPL(nfs_setlease);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 348b535cd786..b5a0afc3ee10 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -253,9 +253,8 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
 
 	dprintk("--> nfs_do_submount()\n");
 
-	dprintk("%s: submounting on %s/%s\n", __func__,
-			dentry->d_parent->d_name.name,
-			dentry->d_name.name);
+	dprintk("%s: submounting on %pd2\n", __func__,
+			dentry);
 	if (page == NULL)
 		goto out;
 	devname = nfs_devname(dentry, page, PAGE_SIZE);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 90cb10d7b693..01b6f6a49d16 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -321,7 +321,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
-	dprintk("NFS call  create %s\n", dentry->d_name.name);
+	dprintk("NFS call  create %pd\n", dentry);
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
@@ -548,7 +548,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	if (len > NFS3_MAXPATHLEN)
 		return -ENAMETOOLONG;
 
-	dprintk("NFS call  symlink %s\n", dentry->d_name.name);
+	dprintk("NFS call  symlink %pd\n", dentry);
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
@@ -576,7 +576,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
-	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
+	dprintk("NFS call  mkdir %pd\n", dentry);
 
 	sattr->ia_mode &= ~current_umask();
 
@@ -695,7 +695,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
-	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name,
+	dprintk("NFS call  mknod %pd %u:%u\n", dentry,
 			MAJOR(rdev), MINOR(rdev));
 
 	sattr->ia_mode &= ~current_umask();
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 77efaf15ec90..9c8f09a2156f 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -31,9 +31,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 	 * -EOPENSTALE.  The VFS will retry the lookup/create/open.
 	 */
 
-	dprintk("NFS: open file(%s/%s)\n",
-		dentry->d_parent->d_name.name,
-		dentry->d_name.name);
+	dprintk("NFS: open file(%pd2)\n", dentry);
 
 	if ((openflags & O_ACCMODE) == 3)
 		openflags--;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2288cd3c9278..049b9fb0d2c9 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -283,8 +283,7 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
 	if (locations == NULL || locations->nlocations <= 0)
 		goto out;
 
-	dprintk("%s: referral at %s/%s\n", __func__,
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	dprintk("%s: referral at %pd2\n", __func__, dentry);
 
 	page = (char *) __get_free_page(GFP_USER);
 	if (!page)
@@ -348,8 +347,8 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *
 	mnt = ERR_PTR(-ENOENT);
 
 	parent = dget_parent(dentry);
-	dprintk("%s: getting locations for %s/%s\n",
-		__func__, parent->d_name.name, dentry->d_name.name);
+	dprintk("%s: getting locations for %pd2\n",
+		__func__, dentry);
 
 	err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, fs_locations, page);
 	dput(parent);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d53d6785cba2..75e46bbf7f41 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3738,9 +3738,8 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 	};
 	int			status;
 
-	dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __func__,
-			dentry->d_parent->d_name.name,
-			dentry->d_name.name,
+	dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__,
+			dentry,
 			(unsigned long long)cookie);
 	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);
 	res.pgbase = args.pgbase;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index a8f57c728df5..fddbba2d9eff 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -235,7 +235,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	};
 	int status = -ENOMEM;
 
-	dprintk("NFS call  create %s\n", dentry->d_name.name);
+	dprintk("NFS call  create %pd\n", dentry);
 	data = nfs_alloc_createdata(dir, dentry, sattr);
 	if (data == NULL)
 		goto out;
@@ -265,7 +265,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	umode_t mode;
 	int status = -ENOMEM;
 
-	dprintk("NFS call  mknod %s\n", dentry->d_name.name);
+	dprintk("NFS call  mknod %pd\n", dentry);
 
 	mode = sattr->ia_mode;
 	if (S_ISFIFO(mode)) {
@@ -423,7 +423,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	};
 	int status = -ENAMETOOLONG;
 
-	dprintk("NFS call  symlink %s\n", dentry->d_name.name);
+	dprintk("NFS call  symlink %pd\n", dentry);
 
 	if (len > NFS2_MAXPATHLEN)
 		goto out;
@@ -462,7 +462,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 	};
 	int status = -ENOMEM;
 
-	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
+	dprintk("NFS call  mkdir %pd\n", dentry);
 	data = nfs_alloc_createdata(dir, dentry, sattr);
 	if (data == NULL)
 		goto out;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index bb939edd4c99..8285de9eaad2 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -495,9 +495,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	struct rpc_task *task;
 	int            error = -EIO;
 
-	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
-		d_count(dentry));
+	dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n",
+		dentry, d_count(dentry));
 	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
 
 	/*
@@ -522,8 +521,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 				SILLYNAME_FILEID_LEN, fileid,
 				SILLYNAME_COUNTER_LEN, sillycounter);
 
-		dfprintk(VFS, "NFS: trying to rename %s to %s\n",
-				dentry->d_name.name, silly);
+		dfprintk(VFS, "NFS: trying to rename %pd to %s\n",
+				dentry, silly);
 
 		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
 		/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ac1dc331ba31..c1d548211c31 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -954,10 +954,8 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
 
-	dprintk("NFS:       nfs_updatepage(%s/%s %d@%lld)\n",
-		file->f_path.dentry->d_parent->d_name.name,
-		file->f_path.dentry->d_name.name, count,
-		(long long)(page_file_offset(page) + offset));
+	dprintk("NFS:       nfs_updatepage(%pD2 %d@%lld)\n",
+		file, count, (long long)(page_file_offset(page) + offset));
 
 	if (nfs_can_extend_write(file, page, inode)) {
 		count = max(count + offset, nfs_page_length(page));
-- 
cgit v1.3


From a6a9f18f0a9f943ada095753bdc4346aee67b1aa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 10:57:01 -0400
Subject: nfsd: switch to %p[dD]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4recover.c | 12 ++++++------
 fs/nfsd/nfs4state.c   | 15 ++++++---------
 fs/nfsd/nfsfh.c       | 28 +++++++++++++---------------
 fs/nfsd/nfsfh.h       |  4 ++--
 fs/nfsd/vfs.c         |  9 ++++-----
 5 files changed, 31 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e0a65a9e37e9..9c271f42604a 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -385,8 +385,8 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 
 	status = vfs_rmdir(parent->d_inode, child);
 	if (status)
-		printk("failed to remove client recovery directory %s\n",
-				child->d_name.name);
+		printk("failed to remove client recovery directory %pd\n",
+				child);
 	/* Keep trying, success or failure: */
 	return 0;
 }
@@ -410,15 +410,15 @@ out:
 	nfs4_release_reclaim(nn);
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
+			" directory %pD\n", nn->rec_file);
 }
 
 static int
 load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
 	if (child->d_name.len != HEXDIR_LEN - 1) {
-		printk("nfsd4: illegal name %s in recovery directory\n",
-				child->d_name.name);
+		printk("nfsd4: illegal name %pd in recovery directory\n",
+				child);
 		/* Keep trying; maybe the others are OK: */
 		return 0;
 	}
@@ -437,7 +437,7 @@ nfsd4_recdir_load(struct net *net) {
 	status = nfsd4_list_rec_dir(load_recdir, nn);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
+			" directory %pD\n", nn->rec_file);
 	return status;
 }
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0874998a49cd..a601fd49f997 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3843,9 +3843,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-	dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
-			(int)cstate->current_fh.fh_dentry->d_name.len,
-			cstate->current_fh.fh_dentry->d_name.name);
+	dprintk("NFSD: nfsd4_open_confirm on file %pd\n",
+			cstate->current_fh.fh_dentry);
 
 	status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0);
 	if (status)
@@ -3922,9 +3921,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	struct nfs4_ol_stateid *stp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-	dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 
-			(int)cstate->current_fh.fh_dentry->d_name.len,
-			cstate->current_fh.fh_dentry->d_name.name);
+	dprintk("NFSD: nfsd4_open_downgrade on file %pd\n", 
+			cstate->current_fh.fh_dentry);
 
 	/* We don't yet support WANT bits: */
 	if (od->od_deleg_want)
@@ -3980,9 +3978,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	dprintk("NFSD: nfsd4_close on file %.*s\n", 
-			(int)cstate->current_fh.fh_dentry->d_name.len,
-			cstate->current_fh.fh_dentry->d_name.name);
+	dprintk("NFSD: nfsd4_close on file %pd\n", 
+			cstate->current_fh.fh_dentry);
 
 	nfs4_lock_state();
 	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 814afaa4458a..3d0e15ae6f72 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -47,7 +47,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
 		tdentry = parent;
 	}
 	if (tdentry != exp->ex_path.dentry)
-		dprintk("nfsd_acceptable failed at %p %s\n", tdentry, tdentry->d_name.name);
+		dprintk("nfsd_acceptable failed at %p %pd\n", tdentry, tdentry);
 	rv = (tdentry == exp->ex_path.dentry);
 	dput(tdentry);
 	return rv;
@@ -253,8 +253,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 			(dentry->d_flags & DCACHE_DISCONNECTED)) {
-		printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
-				dentry->d_parent->d_name.name, dentry->d_name.name);
+		printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n",
+				dentry);
 	}
 
 	fhp->fh_dentry = dentry;
@@ -361,10 +361,9 @@ skip_pseudoflavor_check:
 	error = nfsd_permission(rqstp, exp, dentry, access);
 
 	if (error) {
-		dprintk("fh_verify: %s/%s permission failure, "
+		dprintk("fh_verify: %pd2 permission failure, "
 			"acc=%x, error=%d\n",
-			dentry->d_parent->d_name.name,
-			dentry->d_name.name,
+			dentry,
 			access, ntohl(error));
 	}
 out:
@@ -514,14 +513,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 	 */
 
 	struct inode * inode = dentry->d_inode;
-	struct dentry *parent = dentry->d_parent;
 	__u32 *datap;
 	dev_t ex_dev = exp_sb(exp)->s_dev;
 
-	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
+	dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
 		MAJOR(ex_dev), MINOR(ex_dev),
 		(long) exp->ex_path.dentry->d_inode->i_ino,
-		parent->d_name.name, dentry->d_name.name,
+		dentry,
 		(inode ? inode->i_ino : 0));
 
 	/* Choose filehandle version and fsid type based on
@@ -534,13 +532,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 		fh_put(ref_fh);
 
 	if (fhp->fh_locked || fhp->fh_dentry) {
-		printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n",
-		       parent->d_name.name, dentry->d_name.name);
+		printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n",
+		       dentry);
 	}
 	if (fhp->fh_maxsize < NFS_FHSIZE)
-		printk(KERN_ERR "fh_compose: called with maxsize %d! %s/%s\n",
+		printk(KERN_ERR "fh_compose: called with maxsize %d! %pd2\n",
 		       fhp->fh_maxsize,
-		       parent->d_name.name, dentry->d_name.name);
+		       dentry);
 
 	fhp->fh_dentry = dget(dentry); /* our internal copy */
 	fhp->fh_export = exp;
@@ -613,8 +611,8 @@ out_bad:
 	printk(KERN_ERR "fh_update: fh not verified!\n");
 	goto out;
 out_negative:
-	printk(KERN_ERR "fh_update: %s/%s still negative!\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	printk(KERN_ERR "fh_update: %pd2 still negative!\n",
+		dentry);
 	goto out;
 }
 
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index e5e6707ba687..4775bc4896c8 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -173,8 +173,8 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
 	BUG_ON(!dentry);
 
 	if (fhp->fh_locked) {
-		printk(KERN_WARNING "fh_lock: %s/%s already locked!\n",
-			dentry->d_parent->d_name.name, dentry->d_name.name);
+		printk(KERN_WARNING "fh_lock: %pd2 already locked!\n",
+			dentry);
 		return;
 	}
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c827acb0e943..13886f7f40d5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1317,9 +1317,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		if (!fhp->fh_locked) {
 			/* not actually possible */
 			printk(KERN_ERR
-				"nfsd_create: parent %s/%s not locked!\n",
-				dentry->d_parent->d_name.name,
-				dentry->d_name.name);
+				"nfsd_create: parent %pd2 not locked!\n",
+				dentry);
 			err = nfserr_io;
 			goto out;
 		}
@@ -1329,8 +1328,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 */
 	err = nfserr_exist;
 	if (dchild->d_inode) {
-		dprintk("nfsd_create: dentry %s/%s not negative!\n",
-			dentry->d_name.name, dchild->d_name.name);
+		dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
+			dentry, dchild);
 		goto out; 
 	}
 
-- 
cgit v1.3


From 4cb2a01d8c4bd18a0bb292f32e9876b0c4461e12 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 10:58:53 -0400
Subject: ubifs: switch to %pd

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ubifs/dir.c     | 41 +++++++++++++++++++----------------------
 fs/ubifs/journal.c |  6 ++----
 fs/ubifs/xattr.c   | 16 ++++++++--------
 3 files changed, 29 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 6b4947f75af7..ea41649e4ca5 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -192,8 +192,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 	struct ubifs_dent_node *dent;
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
 
-	dbg_gen("'%.*s' in dir ino %lu",
-		dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+	dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
 
 	if (dentry->d_name.len > UBIFS_MAX_NLEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -225,8 +224,8 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 		 * checking.
 		 */
 		err = PTR_ERR(inode);
-		ubifs_err("dead directory entry '%.*s', error %d",
-			  dentry->d_name.len, dentry->d_name.name, err);
+		ubifs_err("dead directory entry '%pd', error %d",
+			  dentry, err);
 		ubifs_ro_mode(c, err);
 		goto out;
 	}
@@ -260,8 +259,8 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	 * parent directory inode.
 	 */
 
-	dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
-		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+	dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+		dentry, mode, dir->i_ino);
 
 	err = ubifs_budget_space(c, &req);
 	if (err)
@@ -509,8 +508,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	 * changing the parent inode.
 	 */
 
-	dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
-		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+	dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu",
+		dentry, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
@@ -566,8 +565,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	 * deletions.
 	 */
 
-	dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
-		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+	dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
+		dentry, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
@@ -656,8 +655,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 	 * because we have extra space reserved for deletions.
 	 */
 
-	dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
-		dentry->d_name.name, inode->i_ino, dir->i_ino);
+	dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry,
+		inode->i_ino, dir->i_ino);
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = check_dir_empty(c, dentry->d_inode);
@@ -716,8 +715,8 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	 * directory inode.
 	 */
 
-	dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
-		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+	dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+		dentry, mode, dir->i_ino);
 
 	err = ubifs_budget_space(c, &req);
 	if (err)
@@ -778,8 +777,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 	 * directory inode.
 	 */
 
-	dbg_gen("dent '%.*s' in dir ino %lu",
-		dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+	dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -853,8 +851,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 	 * directory inode.
 	 */
 
-	dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
-		dentry->d_name.name, symname, dir->i_ino);
+	dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry,
+		symname, dir->i_ino);
 
 	if (len > UBIFS_MAX_INO_DATA)
 		return -ENAMETOOLONG;
@@ -979,10 +977,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * separately.
 	 */
 
-	dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu",
-		old_dentry->d_name.len, old_dentry->d_name.name,
-		old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
-		new_dentry->d_name.name, new_dir->i_ino);
+	dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu",
+		old_dentry, old_inode->i_ino, old_dir->i_ino,
+		new_dentry, new_dir->i_ino);
 	ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
 	if (unlink)
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index afaad07f3b29..0e045e75abd8 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -933,10 +933,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 	int move = (old_dir != new_dir);
 	struct ubifs_inode *uninitialized_var(new_ui);
 
-	dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu",
-		old_dentry->d_name.len, old_dentry->d_name.name,
-		old_dir->i_ino, new_dentry->d_name.len,
-		new_dentry->d_name.name, new_dir->i_ino);
+	dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu",
+		old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino);
 	ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
 	ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
 	ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 0f7139bdb2c2..5e0a63b1b0d5 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -303,8 +303,8 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
 	union ubifs_key key;
 	int err, type;
 
-	dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
-		host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name,
+		host->i_ino, dentry, size);
 	ubifs_assert(mutex_is_locked(&host->i_mutex));
 
 	if (size > UBIFS_MAX_INO_DATA)
@@ -367,8 +367,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 	union ubifs_key key;
 	int err;
 
-	dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name,
-		host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+	dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
+		host->i_ino, dentry, size);
 
 	err = check_namespace(&nm);
 	if (err < 0)
@@ -426,8 +426,8 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	int err, len, written = 0;
 	struct qstr nm = { .name = NULL };
 
-	dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino,
-		dentry->d_name.len, dentry->d_name.name, size);
+	dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino,
+		dentry, size);
 
 	len = host_ui->xattr_names + host_ui->xattr_cnt;
 	if (!buffer)
@@ -529,8 +529,8 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
 	union ubifs_key key;
 	int err;
 
-	dbg_gen("xattr '%s', ino %lu ('%.*s')", name,
-		host->i_ino, dentry->d_name.len, dentry->d_name.name);
+	dbg_gen("xattr '%s', ino %lu ('%pd')", name,
+		host->i_ino, dentry);
 	ubifs_assert(mutex_is_locked(&host->i_mutex));
 
 	err = check_namespace(&nm);
-- 
cgit v1.3


From 84eb3532b574bb6a65c0aad9591b28b2adcd47d2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 10:59:55 -0400
Subject: ncpfs: switch to %p[dD]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ncpfs/dir.c  | 55 ++++++++++++++++++++-----------------------------------
 fs/ncpfs/file.c | 12 ++++--------
 2 files changed, 24 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 3be047474bfc..c320ac52353e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -339,9 +339,8 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 	if (val)
 		goto finished;
 
-	DDPRINTK("ncp_lookup_validate: %s/%s not valid, age=%ld, server lookup\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
-		NCP_GET_AGE(dentry));
+	DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n",
+		dentry, NCP_GET_AGE(dentry));
 
 	len = sizeof(__name);
 	if (ncp_is_server_root(dir)) {
@@ -359,8 +358,8 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 			res = ncp_obtain_info(server, dir, __name, &(finfo.i));
 	}
 	finfo.volume = finfo.i.volNumber;
-	DDPRINTK("ncp_lookup_validate: looked for %s/%s, res=%d\n",
-		dentry->d_parent->d_name.name, __name, res);
+	DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n",
+		dentry->d_parent, __name, res);
 	/*
 	 * If we didn't find it, or if it has a different dirEntNum to
 	 * what we remember, it's not valid any more.
@@ -454,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
 	ctl.page  = NULL;
 	ctl.cache = NULL;
 
-	DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
+	DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file,
 		(int) ctx->pos);
 
 	result = -EIO;
@@ -740,12 +738,10 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx,
 	int more;
 	size_t bufsize;
 
-	DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name,
+	DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file,
 		(unsigned long) ctx->pos);
-	PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
-		dentry->d_name.name, NCP_FINFO(dir)->volNumber,
-		NCP_FINFO(dir)->dirEntNum);
+	PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n",
+		file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
 
 	err = ncp_initialize_search(server, dir, &seq);
 	if (err) {
@@ -850,8 +846,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 	if (!ncp_conn_valid(server))
 		goto finished;
 
-	PPRINTK("ncp_lookup: server lookup for %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry);
 
 	len = sizeof(__name);
 	if (ncp_is_server_root(dir)) {
@@ -867,8 +862,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 		if (!res)
 			res = ncp_obtain_info(server, dir, __name, &(finfo.i));
 	}
-	PPRINTK("ncp_lookup: looked for %s/%s, res=%d\n",
-		dentry->d_parent->d_name.name, __name, res);
+	PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res);
 	/*
 	 * If we didn't find an entry, make a negative dentry.
 	 */
@@ -915,8 +909,7 @@ out:
 	return error;
 
 out_close:
-	PPRINTK("ncp_instantiate: %s/%s failed, closing file\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry);
 	ncp_close_file(NCP_SERVER(dir), finfo->file_handle);
 	goto out;
 }
@@ -930,8 +923,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
 	int opmode;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 	
-	PPRINTK("ncp_create_new: creating %s/%s, mode=%hx\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name, mode);
+	PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode);
 
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
@@ -960,8 +952,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
 				error = -ENAMETOOLONG;
 			else if (result < 0)
 				error = result;
-			DPRINTK("ncp_create: %s/%s failed\n",
-				dentry->d_parent->d_name.name, dentry->d_name.name);
+			DPRINTK("ncp_create: %pd2 failed\n", dentry);
 			goto out;
 		}
 		opmode = O_WRONLY;
@@ -994,8 +985,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	int error, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_mkdir: making %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	DPRINTK("ncp_mkdir: making %pd2\n", dentry);
 
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
@@ -1032,8 +1022,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	int error, result, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_rmdir: removing %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	DPRINTK("ncp_rmdir: removing %pd2\n", dentry);
 
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -1078,8 +1067,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	int error;
 
 	server = NCP_SERVER(dir);
-	DPRINTK("ncp_unlink: unlinking %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	DPRINTK("ncp_unlink: unlinking %pd2\n", dentry);
 	
 	/*
 	 * Check whether to close the file ...
@@ -1099,8 +1087,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 #endif
 	switch (error) {
 		case 0x00:
-			DPRINTK("ncp: removed %s/%s\n",
-				dentry->d_parent->d_name.name, dentry->d_name.name);
+			DPRINTK("ncp: removed %pd2\n", dentry);
 			break;
 		case 0x85:
 		case 0x8A:
@@ -1133,9 +1120,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int old_len, new_len;
 	__u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_rename: %s/%s to %s/%s\n",
-		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
-		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
+	DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry);
 
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
@@ -1165,8 +1150,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 #endif
 	switch (error) {
 		case 0x00:
-               	        DPRINTK("ncp renamed %s -> %s.\n",
-                                old_dentry->d_name.name,new_dentry->d_name.name);
+               	        DPRINTK("ncp renamed %pd -> %pd.\n",
+                                old_dentry, new_dentry);
 			break;
 		case 0x9E:
 			error = -ENAMETOOLONG;
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 122e260247f5..8f5074e1ecb9 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -107,8 +107,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	void* freepage;
 	size_t freelen;
 
-	DPRINTK("ncp_file_read: enter %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	DPRINTK("ncp_file_read: enter %pd2\n", dentry);
 
 	pos = *ppos;
 
@@ -166,8 +165,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 
 	file_accessed(file);
 
-	DPRINTK("ncp_file_read: exit %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	DPRINTK("ncp_file_read: exit %pd2\n", dentry);
 outrel:
 	ncp_inode_close(inode);		
 	return already_read ? already_read : error;
@@ -184,8 +182,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	int errno;
 	void* bouncebuffer;
 
-	DPRINTK("ncp_file_write: enter %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	DPRINTK("ncp_file_write: enter %pd2\n", dentry);
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
@@ -264,8 +261,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 			i_size_write(inode, pos);
 		mutex_unlock(&inode->i_mutex);
 	}
-	DPRINTK("ncp_file_write: exit %s/%s\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
+	DPRINTK("ncp_file_write: exit %pd2\n", dentry);
 outrel:
 	ncp_inode_close(inode);		
 	return already_written ? already_written : errno;
-- 
cgit v1.3


From 7b00ed6fe6321547b9d51f5bfb30e7b5932e6889 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 21:19:20 -0400
Subject: put_mnt_ns(): use drop_collected_mounts()

... rather than open-coding it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index da5c49483430..5918fc31a636 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2802,11 +2802,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
 {
 	if (!atomic_dec_and_test(&ns->count))
 		return;
-	namespace_lock();
-	br_write_lock(&vfsmount_lock);
-	umount_tree(ns->root, 0);
-	br_write_unlock(&vfsmount_lock);
-	namespace_unlock();
+	drop_collected_mounts(&ns->root->mnt);
 	free_mnt_ns(ns);
 }
 
-- 
cgit v1.3


From e84f9e57b90ca89664d733a7cef19aa7ccd832f3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 Sep 2013 14:17:15 -0400
Subject: consolidate the reassignments of ->f_op in ->open() instances

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/char/misc.c             | 12 +++---------
 drivers/gpu/drm/drm_fops.c      | 17 ++++++-----------
 drivers/media/dvb-core/dvbdev.c | 19 ++++++-------------
 drivers/usb/core/file.c         | 16 ++++------------
 fs/char_dev.c                   |  6 ++++--
 include/linux/fs.h              | 11 +++++++++++
 sound/core/sound.c              | 22 ++++++----------------
 sound/sound_core.c              | 17 +++--------------
 8 files changed, 43 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 190d4423653f..fd0f7c4b4df9 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -114,7 +114,7 @@ static int misc_open(struct inode * inode, struct file * file)
 	int minor = iminor(inode);
 	struct miscdevice *c;
 	int err = -ENODEV;
-	const struct file_operations *old_fops, *new_fops = NULL;
+	const struct file_operations *new_fops = NULL;
 
 	mutex_lock(&misc_mtx);
 	
@@ -141,17 +141,11 @@ static int misc_open(struct inode * inode, struct file * file)
 	}
 
 	err = 0;
-	old_fops = file->f_op;
-	file->f_op = new_fops;
+	replace_fops(file, new_fops);
 	if (file->f_op->open) {
 		file->private_data = c;
-		err=file->f_op->open(inode,file);
-		if (err) {
-			fops_put(file->f_op);
-			file->f_op = fops_get(old_fops);
-		}
+		err = file->f_op->open(inode,file);
 	}
-	fops_put(old_fops);
 fail:
 	mutex_unlock(&misc_mtx);
 	return err;
diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c
index 3f84277d7036..22d14ecbd3ec 100644
--- a/drivers/gpu/drm/drm_fops.c
+++ b/drivers/gpu/drm/drm_fops.c
@@ -148,7 +148,7 @@ int drm_stub_open(struct inode *inode, struct file *filp)
 	struct drm_minor *minor;
 	int minor_id = iminor(inode);
 	int err = -ENODEV;
-	const struct file_operations *old_fops;
+	const struct file_operations *new_fops;
 
 	DRM_DEBUG("\n");
 
@@ -163,18 +163,13 @@ int drm_stub_open(struct inode *inode, struct file *filp)
 	if (drm_device_is_unplugged(dev))
 		goto out;
 
-	old_fops = filp->f_op;
-	filp->f_op = fops_get(dev->driver->fops);
-	if (filp->f_op == NULL) {
-		filp->f_op = old_fops;
+	new_fops = fops_get(dev->driver->fops);
+	if (!new_fops)
 		goto out;
-	}
-	if (filp->f_op->open && (err = filp->f_op->open(inode, filp))) {
-		fops_put(filp->f_op);
-		filp->f_op = fops_get(old_fops);
-	}
-	fops_put(old_fops);
 
+	replace_fops(filp, new_fops);
+	if (filp->f_op->open)
+		err = filp->f_op->open(inode, filp);
 out:
 	mutex_unlock(&drm_global_mutex);
 	return err;
diff --git a/drivers/media/dvb-core/dvbdev.c b/drivers/media/dvb-core/dvbdev.c
index 401ef64f92c6..983db75de350 100644
--- a/drivers/media/dvb-core/dvbdev.c
+++ b/drivers/media/dvb-core/dvbdev.c
@@ -74,22 +74,15 @@ static int dvb_device_open(struct inode *inode, struct file *file)
 
 	if (dvbdev && dvbdev->fops) {
 		int err = 0;
-		const struct file_operations *old_fops;
+		const struct file_operations *new_fops;
 
-		file->private_data = dvbdev;
-		old_fops = file->f_op;
-		file->f_op = fops_get(dvbdev->fops);
-		if (file->f_op == NULL) {
-			file->f_op = old_fops;
+		new_fops = fops_get(dvbdev->fops);
+		if (!new_fops)
 			goto fail;
-		}
-		if(file->f_op->open)
+		file->private_data = dvbdev;
+		replace_fops(file, new_fops);
+		if (file->f_op->open)
 			err = file->f_op->open(inode,file);
-		if (err) {
-			fops_put(file->f_op);
-			file->f_op = fops_get(old_fops);
-		}
-		fops_put(old_fops);
 		up_read(&minor_rwsem);
 		mutex_unlock(&dvbdev_mutex);
 		return err;
diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index 7421888087a3..fe0d8365411a 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c
@@ -29,27 +29,19 @@ static DECLARE_RWSEM(minor_rwsem);
 
 static int usb_open(struct inode * inode, struct file * file)
 {
-	int minor = iminor(inode);
-	const struct file_operations *c;
 	int err = -ENODEV;
-	const struct file_operations *old_fops, *new_fops = NULL;
+	const struct file_operations *new_fops;
 
 	down_read(&minor_rwsem);
-	c = usb_minors[minor];
+	new_fops = fops_get(usb_minors[iminor(inode)]);
 
-	if (!c || !(new_fops = fops_get(c)))
+	if (!new_fops)
 		goto done;
 
-	old_fops = file->f_op;
-	file->f_op = new_fops;
+	replace_fops(file, new_fops);
 	/* Curiouser and curiouser... NULL ->open() as "no device" ? */
 	if (file->f_op->open)
 		err = file->f_op->open(inode,file);
-	if (err) {
-		fops_put(file->f_op);
-		file->f_op = fops_get(old_fops);
-	}
-	fops_put(old_fops);
  done:
 	up_read(&minor_rwsem);
 	return err;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index afc2bb691780..94b5f60076da 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -368,6 +368,7 @@ void cdev_put(struct cdev *p)
  */
 static int chrdev_open(struct inode *inode, struct file *filp)
 {
+	const struct file_operations *fops;
 	struct cdev *p;
 	struct cdev *new = NULL;
 	int ret = 0;
@@ -400,10 +401,11 @@ static int chrdev_open(struct inode *inode, struct file *filp)
 		return ret;
 
 	ret = -ENXIO;
-	filp->f_op = fops_get(p->ops);
-	if (!filp->f_op)
+	fops = fops_get(p->ops);
+	if (!fops)
 		goto out_cdev_put;
 
+	replace_fops(filp, fops);
 	if (filp->f_op->open) {
 		ret = filp->f_op->open(inode, filp);
 		if (ret)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d80c2437d624..b09e4e1d747a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1875,6 +1875,17 @@ extern struct dentry *mount_pseudo(struct file_system_type *, char *,
 	(((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
 #define fops_put(fops) \
 	do { if (fops) module_put((fops)->owner); } while(0)
+/*
+ * This one is to be used *ONLY* from ->open() instances.
+ * fops must be non-NULL, pinned down *and* module dependencies
+ * should be sufficient to pin the caller down as well.
+ */
+#define replace_fops(f, fops) \
+	do {	\
+		struct file *__file = (f); \
+		fops_put(__file->f_op); \
+		BUG_ON(!(__file->f_op = (fops))); \
+	} while(0)
 
 extern int register_filesystem(struct file_system_type *);
 extern int unregister_filesystem(struct file_system_type *);
diff --git a/sound/core/sound.c b/sound/core/sound.c
index f002bd911dae..437c25ea6403 100644
--- a/sound/core/sound.c
+++ b/sound/core/sound.c
@@ -153,7 +153,7 @@ static int snd_open(struct inode *inode, struct file *file)
 {
 	unsigned int minor = iminor(inode);
 	struct snd_minor *mptr = NULL;
-	const struct file_operations *old_fops;
+	const struct file_operations *new_fops;
 	int err = 0;
 
 	if (minor >= ARRAY_SIZE(snd_minors))
@@ -167,24 +167,14 @@ static int snd_open(struct inode *inode, struct file *file)
 			return -ENODEV;
 		}
 	}
-	old_fops = file->f_op;
-	file->f_op = fops_get(mptr->f_ops);
-	if (file->f_op == NULL) {
-		file->f_op = old_fops;
-		err = -ENODEV;
-	}
+	new_fops = fops_get(mptr->f_ops);
 	mutex_unlock(&sound_mutex);
-	if (err < 0)
-		return err;
+	if (!new_fops)
+		return -ENODEV;
+	replace_fops(file, new_fops);
 
-	if (file->f_op->open) {
+	if (file->f_op->open)
 		err = file->f_op->open(inode, file);
-		if (err) {
-			fops_put(file->f_op);
-			file->f_op = fops_get(old_fops);
-		}
-	}
-	fops_put(old_fops);
 	return err;
 }
 
diff --git a/sound/sound_core.c b/sound/sound_core.c
index 45759f4cca75..11e953a1fa45 100644
--- a/sound/sound_core.c
+++ b/sound/sound_core.c
@@ -626,31 +626,20 @@ static int soundcore_open(struct inode *inode, struct file *file)
 		if (s)
 			new_fops = fops_get(s->unit_fops);
 	}
+	spin_unlock(&sound_loader_lock);
 	if (new_fops) {
 		/*
 		 * We rely upon the fact that we can't be unloaded while the
-		 * subdriver is there, so if ->open() is successful we can
-		 * safely drop the reference counter and if it is not we can
-		 * revert to old ->f_op. Ugly, indeed, but that's the cost of
-		 * switching ->f_op in the first place.
+		 * subdriver is there.
 		 */
 		int err = 0;
-		const struct file_operations *old_fops = file->f_op;
-		file->f_op = new_fops;
-		spin_unlock(&sound_loader_lock);
+		replace_fops(file, new_fops);
 
 		if (file->f_op->open)
 			err = file->f_op->open(inode,file);
 
-		if (err) {
-			fops_put(file->f_op);
-			file->f_op = fops_get(old_fops);
-		}
-
-		fops_put(old_fops);
 		return err;
 	}
-	spin_unlock(&sound_loader_lock);
 	return -ENODEV;
 }
 
-- 
cgit v1.3


From 72c2d53192004845cbc19cd8a30b3212a9288140 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 Sep 2013 16:27:52 -0400
Subject: file->f_op is never NULL...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/mtd/nand/nandsim.c               |  2 +-
 drivers/staging/comedi/comedi_compat32.c |  3 ---
 fs/autofs4/autofs_i.h                    |  2 +-
 fs/autofs4/dev-ioctl.c                   |  6 ------
 fs/binfmt_aout.c                         |  4 ++--
 fs/binfmt_elf.c                          |  6 +++---
 fs/binfmt_elf_fdpic.c                    |  2 +-
 fs/binfmt_em86.c                         |  2 +-
 fs/coda/dir.c                            |  3 ---
 fs/coda/file.c                           |  6 +++---
 fs/compat_ioctl.c                        |  4 ++--
 fs/coredump.c                            |  2 +-
 fs/ecryptfs/file.c                       |  8 ++++----
 fs/eventpoll.c                           |  2 +-
 fs/exec.c                                | 35 ++++++++++++++------------------
 fs/fcntl.c                               |  5 ++---
 fs/file_table.c                          |  4 ++--
 fs/ioctl.c                               |  4 ++--
 fs/locks.c                               | 12 +++++------
 fs/open.c                                |  8 ++++++--
 fs/read_write.c                          | 25 ++++++++---------------
 fs/readdir.c                             |  2 +-
 fs/select.c                              |  4 ++--
 fs/splice.c                              |  6 +++---
 fs/sync.c                                |  2 +-
 mm/memory.c                              |  2 +-
 mm/mmap.c                                |  4 ++--
 mm/nommu.c                               |  2 +-
 net/9p/trans_fd.c                        |  4 ++--
 29 files changed, 74 insertions(+), 97 deletions(-)

(limited to 'fs')

diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index bdc1d15369f8..d1c7de92cfdf 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -575,7 +575,7 @@ static int alloc_device(struct nandsim *ns)
 		cfile = filp_open(cache_file, O_CREAT | O_RDWR | O_LARGEFILE, 0600);
 		if (IS_ERR(cfile))
 			return PTR_ERR(cfile);
-		if (!cfile->f_op || (!cfile->f_op->read && !cfile->f_op->aio_read)) {
+		if (!cfile->f_op->read && !cfile->f_op->aio_read) {
 			NS_ERR("alloc_device: cache file not readable\n");
 			err = -EINVAL;
 			goto err_close;
diff --git a/drivers/staging/comedi/comedi_compat32.c b/drivers/staging/comedi/comedi_compat32.c
index 2dfb06aedb15..1e9da405d833 100644
--- a/drivers/staging/comedi/comedi_compat32.c
+++ b/drivers/staging/comedi/comedi_compat32.c
@@ -86,9 +86,6 @@ struct comedi32_insnlist_struct {
 static int translated_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
 {
-	if (!file->f_op)
-		return -ENOTTY;
-
 	if (file->f_op->unlocked_ioctl)
 		return file->f_op->unlocked_ioctl(file, cmd, arg);
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3f1128b37e46..ebab147e9700 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -271,7 +271,7 @@ void autofs4_clean_ino(struct autofs_info *);
 
 static inline int autofs_prepare_pipe(struct file *pipe)
 {
-	if (!pipe->f_op || !pipe->f_op->write)
+	if (!pipe->f_op->write)
 		return -EINVAL;
 	if (!S_ISFIFO(file_inode(pipe)->i_mode))
 		return -EINVAL;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 0f00da329e71..1818ce7f5a06 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -658,12 +658,6 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
 			goto out;
 		}
 
-		if (!fp->f_op) {
-			err = -ENOTTY;
-			fput(fp);
-			goto out;
-		}
-
 		sbi = autofs_dev_ioctl_sbi(fp);
 		if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
 			err = -EINVAL;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 89dec7f789a4..faaa819f70fa 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -221,7 +221,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 	 * Requires a mmap handler. This prevents people from using a.out
 	 * as part of an exploit attack against /proc-related vulnerabilities.
 	 */
-	if (!bprm->file->f_op || !bprm->file->f_op->mmap)
+	if (!bprm->file->f_op->mmap)
 		return -ENOEXEC;
 
 	fd_offset = N_TXTOFF(ex);
@@ -374,7 +374,7 @@ static int load_aout_library(struct file *file)
 	 * Requires a mmap handler. This prevents people from using a.out
 	 * as part of an exploit attack against /proc-related vulnerabilities.
 	 */
-	if (!file->f_op || !file->f_op->mmap)
+	if (!file->f_op->mmap)
 		goto out;
 
 	if (N_FLAGS(ex))
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4c94a79991bb..501c8a4d6eb1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -406,7 +406,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		goto out;
 	if (!elf_check_arch(interp_elf_ex))
 		goto out;
-	if (!interpreter->f_op || !interpreter->f_op->mmap)
+	if (!interpreter->f_op->mmap)
 		goto out;
 
 	/*
@@ -607,7 +607,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		goto out;
 	if (!elf_check_arch(&loc->elf_ex))
 		goto out;
-	if (!bprm->file->f_op || !bprm->file->f_op->mmap)
+	if (!bprm->file->f_op->mmap)
 		goto out;
 
 	/* Now read in all of the header information */
@@ -1028,7 +1028,7 @@ static int load_elf_library(struct file *file)
 
 	/* First of all, some simple consistency checks */
 	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
-	    !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+	    !elf_check_arch(&elf_ex) || !file->f_op->mmap)
 		goto out;
 
 	/* Now read in all of the header information */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c166f325a183..ea4c6273b4a5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -111,7 +111,7 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
 		return 0;
 	if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
 		return 0;
-	if (!file->f_op || !file->f_op->mmap)
+	if (!file->f_op->mmap)
 		return 0;
 	return 1;
 }
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 037a3e2b045b..f37b08cea1f7 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -38,7 +38,7 @@ static int load_em86(struct linux_binprm *bprm)
 	/* First of all, some simple consistency checks */
 	if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) ||
 		(!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) ||
-		(!bprm->file->f_op || !bprm->file->f_op->mmap)) {
+		!bprm->file->f_op->mmap) {
 			return -ENOEXEC;
 	}
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 190effc6a6fa..8c8d469c9283 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -387,9 +387,6 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (!host_file->f_op)
-		return -ENOTDIR;
-
 	if (host_file->f_op->iterate) {
 		struct inode *host_inode = file_inode(host_file);
 		mutex_lock(&host_inode->i_mutex);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 380b798f8443..9e83b7790212 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -36,7 +36,7 @@ coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *p
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (!host_file->f_op || !host_file->f_op->read)
+	if (!host_file->f_op->read)
 		return -EINVAL;
 
 	return host_file->f_op->read(host_file, buf, count, ppos);
@@ -75,7 +75,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (!host_file->f_op || !host_file->f_op->write)
+	if (!host_file->f_op->write)
 		return -EINVAL;
 
 	host_inode = file_inode(host_file);
@@ -105,7 +105,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (!host_file->f_op || !host_file->f_op->mmap)
+	if (!host_file->f_op->mmap)
 		return -ENODEV;
 
 	coda_inode = file_inode(coda_file);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5d19acfa7c6c..dc52e13d58e0 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1583,13 +1583,13 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 		/*FALL THROUGH*/
 
 	default:
-		if (f.file->f_op && f.file->f_op->compat_ioctl) {
+		if (f.file->f_op->compat_ioctl) {
 			error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
 			if (error != -ENOIOCTLCMD)
 				goto out_fput;
 		}
 
-		if (!f.file->f_op || !f.file->f_op->unlocked_ioctl)
+		if (!f.file->f_op->unlocked_ioctl)
 			goto do_ioctl;
 		break;
 	}
diff --git a/fs/coredump.c b/fs/coredump.c
index 9bdeca12ae0e..42c3b8423669 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -645,7 +645,7 @@ void do_coredump(siginfo_t *siginfo)
 		 */
 		if (!uid_eq(inode->i_uid, current_fsuid()))
 			goto close_fail;
-		if (!cprm.file->f_op || !cprm.file->f_op->write)
+		if (!cprm.file->f_op->write)
 			goto close_fail;
 		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
 			goto close_fail;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 992cf95830b5..6b1cd2b0c751 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -271,7 +271,7 @@ static int ecryptfs_flush(struct file *file, fl_owner_t td)
 {
 	struct file *lower_file = ecryptfs_file_to_lower(file);
 
-	if (lower_file->f_op && lower_file->f_op->flush) {
+	if (lower_file->f_op->flush) {
 		filemap_write_and_wait(file->f_mapping);
 		return lower_file->f_op->flush(lower_file, td);
 	}
@@ -305,7 +305,7 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
 	struct file *lower_file = NULL;
 
 	lower_file = ecryptfs_file_to_lower(file);
-	if (lower_file->f_op && lower_file->f_op->fasync)
+	if (lower_file->f_op->fasync)
 		rc = lower_file->f_op->fasync(fd, lower_file, flag);
 	return rc;
 }
@@ -318,7 +318,7 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	if (ecryptfs_file_to_private(file))
 		lower_file = ecryptfs_file_to_lower(file);
-	if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
+	if (lower_file->f_op && lower_file->f_op->unlocked_ioctl)
 		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
 	return rc;
 }
@@ -332,7 +332,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	if (ecryptfs_file_to_private(file))
 		lower_file = ecryptfs_file_to_lower(file);
-	if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
+	if (lower_file->f_op && lower_file->f_op->compat_ioctl)
 		rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
 	return rc;
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 473e09da7d02..31fd77cd81eb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1816,7 +1816,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
 	/* The target file descriptor must support poll */
 	error = -EPERM;
-	if (!tf.file->f_op || !tf.file->f_op->poll)
+	if (!tf.file->f_op->poll)
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
diff --git a/fs/exec.c b/fs/exec.c
index 8875dd10ae7a..351440fbf1ee 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -106,6 +106,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
  */
 SYSCALL_DEFINE1(uselib, const char __user *, library)
 {
+	struct linux_binfmt *fmt;
 	struct file *file;
 	struct filename *tmp = getname(library);
 	int error = PTR_ERR(tmp);
@@ -136,24 +137,21 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	fsnotify_open(file);
 
 	error = -ENOEXEC;
-	if(file->f_op) {
-		struct linux_binfmt * fmt;
 
-		read_lock(&binfmt_lock);
-		list_for_each_entry(fmt, &formats, lh) {
-			if (!fmt->load_shlib)
-				continue;
-			if (!try_module_get(fmt->module))
-				continue;
-			read_unlock(&binfmt_lock);
-			error = fmt->load_shlib(file);
-			read_lock(&binfmt_lock);
-			put_binfmt(fmt);
-			if (error != -ENOEXEC)
-				break;
-		}
+	read_lock(&binfmt_lock);
+	list_for_each_entry(fmt, &formats, lh) {
+		if (!fmt->load_shlib)
+			continue;
+		if (!try_module_get(fmt->module))
+			continue;
 		read_unlock(&binfmt_lock);
+		error = fmt->load_shlib(file);
+		read_lock(&binfmt_lock);
+		put_binfmt(fmt);
+		if (error != -ENOEXEC)
+			break;
 	}
+	read_unlock(&binfmt_lock);
 exit:
 	fput(file);
 out:
@@ -1277,13 +1275,10 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
  */
 int prepare_binprm(struct linux_binprm *bprm)
 {
-	umode_t mode;
-	struct inode * inode = file_inode(bprm->file);
+	struct inode *inode = file_inode(bprm->file);
+	umode_t mode = inode->i_mode;
 	int retval;
 
-	mode = inode->i_mode;
-	if (bprm->file->f_op == NULL)
-		return -EACCES;
 
 	/* clear any previous set[ug]id data from a previous binary */
 	bprm->cred->euid = current_euid();
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 65343c3741ff..ef6866592a0f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -56,7 +56,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 				return -EINVAL;
 	}
 
-	if (filp->f_op && filp->f_op->check_flags)
+	if (filp->f_op->check_flags)
 		error = filp->f_op->check_flags(arg);
 	if (error)
 		return error;
@@ -64,8 +64,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	/*
 	 * ->fasync() is responsible for setting the FASYNC bit.
 	 */
-	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
-			filp->f_op->fasync) {
+	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
 		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
 		if (error < 0)
 			goto out;
diff --git a/fs/file_table.c b/fs/file_table.c
index abdd15ad13c9..e61e5529fa9d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -240,11 +240,11 @@ static void __fput(struct file *file)
 	locks_remove_flock(file);
 
 	if (unlikely(file->f_flags & FASYNC)) {
-		if (file->f_op && file->f_op->fasync)
+		if (file->f_op->fasync)
 			file->f_op->fasync(-1, file, 0);
 	}
 	ima_file_free(file);
-	if (file->f_op && file->f_op->release)
+	if (file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
diff --git a/fs/ioctl.c b/fs/ioctl.c
index fd507fb460f8..8ac3fad36192 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -37,7 +37,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
 {
 	int error = -ENOTTY;
 
-	if (!filp->f_op || !filp->f_op->unlocked_ioctl)
+	if (!filp->f_op->unlocked_ioctl)
 		goto out;
 
 	error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
@@ -501,7 +501,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 
 	/* Did FASYNC state change ? */
 	if ((flag ^ filp->f_flags) & FASYNC) {
-		if (filp->f_op && filp->f_op->fasync)
+		if (filp->f_op->fasync)
 			/* fasync() adjusts filp->f_flags */
 			error = filp->f_op->fasync(fd, filp, on);
 		else
diff --git a/fs/locks.c b/fs/locks.c
index b27a3005d78d..f155ffd648bb 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1579,7 +1579,7 @@ EXPORT_SYMBOL(generic_setlease);
 
 static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
-	if (filp->f_op && filp->f_op->setlease)
+	if (filp->f_op->setlease)
 		return filp->f_op->setlease(filp, arg, lease);
 	else
 		return generic_setlease(filp, arg, lease);
@@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	if (error)
 		goto out_free;
 
-	if (f.file->f_op && f.file->f_op->flock)
+	if (f.file->f_op->flock)
 		error = f.file->f_op->flock(f.file,
 					  (can_sleep) ? F_SETLKW : F_SETLK,
 					  lock);
@@ -1797,7 +1797,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
  */
 int vfs_test_lock(struct file *filp, struct file_lock *fl)
 {
-	if (filp->f_op && filp->f_op->lock)
+	if (filp->f_op->lock)
 		return filp->f_op->lock(filp, F_GETLK, fl);
 	posix_test_lock(filp, fl);
 	return 0;
@@ -1909,7 +1909,7 @@ out:
  */
 int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
 {
-	if (filp->f_op && filp->f_op->lock)
+	if (filp->f_op->lock)
 		return filp->f_op->lock(filp, cmd, fl);
 	else
 		return posix_lock_file(filp, fl, conf);
@@ -2182,7 +2182,7 @@ void locks_remove_flock(struct file *filp)
 	if (!inode->i_flock)
 		return;
 
-	if (filp->f_op && filp->f_op->flock) {
+	if (filp->f_op->flock) {
 		struct file_lock fl = {
 			.fl_pid = current->tgid,
 			.fl_file = filp,
@@ -2246,7 +2246,7 @@ EXPORT_SYMBOL(posix_unblock_lock);
  */
 int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 {
-	if (filp->f_op && filp->f_op->lock)
+	if (filp->f_op->lock)
 		return filp->f_op->lock(filp, F_CANCELLK, fl);
 	return 0;
 }
diff --git a/fs/open.c b/fs/open.c
index d420331ca32a..a1465b1ec8c7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -693,6 +693,10 @@ static int do_dentry_open(struct file *f,
 	}
 
 	f->f_op = fops_get(inode->i_fop);
+	if (unlikely(WARN_ON(!f->f_op))) {
+		error = -ENODEV;
+		goto cleanup_all;
+	}
 
 	error = security_file_open(f, cred);
 	if (error)
@@ -702,7 +706,7 @@ static int do_dentry_open(struct file *f,
 	if (error)
 		goto cleanup_all;
 
-	if (!open && f->f_op)
+	if (!open)
 		open = f->f_op->open;
 	if (open) {
 		error = open(inode, f);
@@ -1023,7 +1027,7 @@ int filp_close(struct file *filp, fl_owner_t id)
 		return 0;
 	}
 
-	if (filp->f_op && filp->f_op->flush)
+	if (filp->f_op->flush)
 		retval = filp->f_op->flush(filp, id);
 
 	if (likely(!(filp->f_mode & FMODE_PATH))) {
diff --git a/fs/read_write.c b/fs/read_write.c
index e3cd280b158c..58e440df1bc6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -257,7 +257,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 
 	fn = no_llseek;
 	if (file->f_mode & FMODE_LSEEK) {
-		if (file->f_op && file->f_op->llseek)
+		if (file->f_op->llseek)
 			fn = file->f_op->llseek;
 	}
 	return fn(file, offset, whence);
@@ -384,7 +384,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 
 	if (!(file->f_mode & FMODE_READ))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
+	if (!file->f_op->read && !file->f_op->aio_read)
 		return -EINVAL;
 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 		return -EFAULT;
@@ -433,7 +433,7 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t
 	const char __user *p;
 	ssize_t ret;
 
-	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
+	if (!file->f_op->write && !file->f_op->aio_write)
 		return -EINVAL;
 
 	old_fs = get_fs();
@@ -460,7 +460,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
+	if (!file->f_op->write && !file->f_op->aio_write)
 		return -EINVAL;
 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 		return -EFAULT;
@@ -727,11 +727,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	io_fn_t fn;
 	iov_fn_t fnv;
 
-	if (!file->f_op) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	ret = rw_copy_check_uvector(type, uvector, nr_segs,
 				    ARRAY_SIZE(iovstack), iovstack, &iov);
 	if (ret <= 0)
@@ -778,7 +773,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 {
 	if (!(file->f_mode & FMODE_READ))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
+	if (!file->f_op->aio_read && !file->f_op->read)
 		return -EINVAL;
 
 	return do_readv_writev(READ, file, vec, vlen, pos);
@@ -791,7 +786,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 {
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
+	if (!file->f_op->aio_write && !file->f_op->write)
 		return -EINVAL;
 
 	return do_readv_writev(WRITE, file, vec, vlen, pos);
@@ -906,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	io_fn_t fn;
 	iov_fn_t fnv;
 
-	ret = -EINVAL;
-	if (!file->f_op)
-		goto out;
-
 	ret = -EFAULT;
 	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 		goto out;
@@ -965,7 +956,7 @@ static size_t compat_readv(struct file *file,
 		goto out;
 
 	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
+	if (!file->f_op->aio_read && !file->f_op->read)
 		goto out;
 
 	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
@@ -1032,7 +1023,7 @@ static size_t compat_writev(struct file *file,
 		goto out;
 
 	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
+	if (!file->f_op->aio_write && !file->f_op->write)
 		goto out;
 
 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
diff --git a/fs/readdir.c b/fs/readdir.c
index 93d71e574310..5b53d995cae6 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -24,7 +24,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 	int res = -ENOTDIR;
-	if (!file->f_op || !file->f_op->iterate)
+	if (!file->f_op->iterate)
 		goto out;
 
 	res = security_file_permission(file, MAY_READ);
diff --git a/fs/select.c b/fs/select.c
index 35d4adc749d9..55278a36a658 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -455,7 +455,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 					const struct file_operations *f_op;
 					f_op = f.file->f_op;
 					mask = DEFAULT_POLLMASK;
-					if (f_op && f_op->poll) {
+					if (f_op->poll) {
 						wait_key_set(wait, in, out,
 							     bit, busy_flag);
 						mask = (*f_op->poll)(f.file, wait);
@@ -762,7 +762,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 		mask = POLLNVAL;
 		if (f.file) {
 			mask = DEFAULT_POLLMASK;
-			if (f.file->f_op && f.file->f_op->poll) {
+			if (f.file->f_op->poll) {
 				pwait->_key = pollfd->events|POLLERR|POLLHUP;
 				pwait->_key |= busy_flag;
 				mask = f.file->f_op->poll(f.file, pwait);
diff --git a/fs/splice.c b/fs/splice.c
index 3b7ee656f3aa..46a08f772d7d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -695,7 +695,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 	loff_t pos = sd->pos;
 	int more;
 
-	if (!likely(file->f_op && file->f_op->sendpage))
+	if (!likely(file->f_op->sendpage))
 		return -EINVAL;
 
 	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
@@ -1099,7 +1099,7 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
 
-	if (out->f_op && out->f_op->splice_write)
+	if (out->f_op->splice_write)
 		splice_write = out->f_op->splice_write;
 	else
 		splice_write = default_file_splice_write;
@@ -1125,7 +1125,7 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	if (in->f_op && in->f_op->splice_read)
+	if (in->f_op->splice_read)
 		splice_read = in->f_op->splice_read;
 	else
 		splice_read = default_file_splice_read;
diff --git a/fs/sync.c b/fs/sync.c
index 905f3f6b3d85..6c0ca3b75758 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,7 +177,7 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	if (!file->f_op || !file->f_op->fsync)
+	if (!file->f_op->fsync)
 		return -EINVAL;
 	return file->f_op->fsync(file, start, end, datasync);
 }
diff --git a/mm/memory.c b/mm/memory.c
index ca0003947115..e2bbba42604c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	if (vma->vm_ops)
 		printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
 		       vma->vm_ops->fault);
-	if (vma->vm_file && vma->vm_file->f_op)
+	if (vma->vm_file)
 		printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
 		       vma->vm_file->f_op->mmap);
 	dump_stack();
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d548512ff8a..ab199dfc9e26 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1299,7 +1299,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 				vm_flags &= ~VM_MAYEXEC;
 			}
 
-			if (!file->f_op || !file->f_op->mmap)
+			if (!file->f_op->mmap)
 				return -ENODEV;
 			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
 				return -EINVAL;
@@ -1951,7 +1951,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		return -ENOMEM;
 
 	get_area = current->mm->get_unmapped_area;
-	if (file && file->f_op && file->f_op->get_unmapped_area)
+	if (file && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
 	addr = get_area(file, addr, len, pgoff, flags);
 	if (IS_ERR_VALUE(addr))
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f158548e..9e6cb02cba64 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -937,7 +937,7 @@ static int validate_mmap_request(struct file *file,
 		struct address_space *mapping;
 
 		/* files must support mmap */
-		if (!file->f_op || !file->f_op->mmap)
+		if (!file->f_op->mmap)
 			return -ENODEV;
 
 		/* work out if what we've got could possibly be shared
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 3ffda1b3799b..9321a7763067 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -244,10 +244,10 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt)
 	if (!ts)
 		return -EREMOTEIO;
 
-	if (!ts->rd->f_op || !ts->rd->f_op->poll)
+	if (!ts->rd->f_op->poll)
 		return -EIO;
 
-	if (!ts->wr->f_op || !ts->wr->f_op->poll)
+	if (!ts->wr->f_op->poll)
 		return -EIO;
 
 	ret = ts->rd->f_op->poll(ts->rd, pt);
-- 
cgit v1.3


From 59aa0da8e232f990be793d0ea23ca74675b236b3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 21:34:53 -0400
Subject: initialize namespace_sem statically

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 5918fc31a636..827aea4c3242 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -39,7 +39,7 @@ static int mnt_group_start = 1;
 static struct list_head *mount_hashtable __read_mostly;
 static struct list_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
-static struct rw_semaphore namespace_sem;
+static DECLARE_RWSEM(namespace_sem);
 
 /* /sys/fs */
 struct kobject *fs_kobj;
@@ -2767,8 +2767,6 @@ void __init mnt_init(void)
 	unsigned u;
 	int err;
 
-	init_rwsem(&namespace_sem);
-
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
-- 
cgit v1.3


From 44bb4385ce1cfb231d37cbc7bb6faf70f98b7b17 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 21:37:36 -0400
Subject: fs_is_visible only needs namespace_sem held shared

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 827aea4c3242..0e54ad6ccd8a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2869,7 +2869,7 @@ bool fs_fully_visible(struct file_system_type *type)
 	if (unlikely(!ns))
 		return false;
 
-	namespace_lock();
+	down_read(&namespace_sem);
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
 		struct mount *child;
 		if (mnt->mnt.mnt_sb->s_type != type)
@@ -2890,7 +2890,7 @@ bool fs_fully_visible(struct file_system_type *type)
 	next:	;
 	}
 found:
-	namespace_unlock();
+	up_read(&namespace_sem);
 	return visible;
 }
 
-- 
cgit v1.3


From aa7a574d0c54cc5a0aceb7357b5097342c0844ee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 22:22:16 -0400
Subject: dup_mnt_ns(): get rid of pointless grabbing of vfsmount_lock

mnt_list is protected by namespace_sem, not vfsmount_lock

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 0e54ad6ccd8a..17ff1470b405 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2444,9 +2444,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		return ERR_CAST(new);
 	}
 	new_ns->root = new;
-	br_write_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new->mnt_list);
-	br_write_unlock(&vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
-- 
cgit v1.3


From 6339dab869e07a81abd22970696dde6febee094a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 16 Sep 2013 22:41:01 -0400
Subject: do_remount(): pull touch_mnt_namespace() up

... and don't bother with dropping and regaining vfsmount_lock

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 17ff1470b405..3ee6e59ead55 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1849,14 +1849,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 		br_write_lock(&vfsmount_lock);
 		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
-		br_write_unlock(&vfsmount_lock);
-	}
-	up_write(&sb->s_umount);
-	if (!err) {
-		br_write_lock(&vfsmount_lock);
 		touch_mnt_namespace(mnt->mnt_ns);
 		br_write_unlock(&vfsmount_lock);
 	}
+	up_write(&sb->s_umount);
 	return err;
 }
 
-- 
cgit v1.3


From 649a795affac8507657a72e761ce987d6967016b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 28 Sep 2013 12:41:25 -0400
Subject: fold mntfree() into mntput_no_expire()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 3ee6e59ead55..691cd7c83275 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -872,28 +872,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return ERR_PTR(err);
 }
 
-static inline void mntfree(struct mount *mnt)
-{
-	struct vfsmount *m = &mnt->mnt;
-	struct super_block *sb = m->mnt_sb;
-
-	/*
-	 * This probably indicates that somebody messed
-	 * up a mnt_want/drop_write() pair.  If this
-	 * happens, the filesystem was probably unable
-	 * to make r/w->r/o transitions.
-	 */
-	/*
-	 * The locking used to deal with mnt_count decrement provides barriers,
-	 * so mnt_get_writers() below is safe.
-	 */
-	WARN_ON(mnt_get_writers(mnt));
-	fsnotify_vfsmount_delete(m);
-	dput(m->mnt_root);
-	free_vfsmnt(mnt);
-	deactivate_super(sb);
-}
-
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
@@ -929,7 +907,22 @@ put_again:
 
 	list_del(&mnt->mnt_instance);
 	br_write_unlock(&vfsmount_lock);
-	mntfree(mnt);
+
+	/*
+	 * This probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this
+	 * happens, the filesystem was probably unable
+	 * to make r/w->r/o transitions.
+	 */
+	/*
+	 * The locking used to deal with mnt_count decrement provides barriers,
+	 * so mnt_get_writers() below is safe.
+	 */
+	WARN_ON(mnt_get_writers(mnt));
+	fsnotify_vfsmount_delete(&mnt->mnt);
+	dput(mnt->mnt.mnt_root);
+	deactivate_super(mnt->mnt.mnt_sb);
+	free_vfsmnt(mnt);
 }
 
 void mntput(struct vfsmount *mnt)
-- 
cgit v1.3


From 085e83ff0c26842e6b14cd4b471625f6dd8a7bfd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 28 Sep 2013 12:54:06 -0400
Subject: fs/namespace.c: bury long-dead define

MNT_WRITER_UNDERFLOW_LIMIT has been missed 4 years ago when it became unused.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 691cd7c83275..cd7dbabf0c6c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -63,8 +63,6 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 	return tmp & (HASH_SIZE - 1);
 }
 
-#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
-
 /*
  * allocation is serialized by namespace_sem, but we need the spinlock to
  * serialize with freeing.
-- 
cgit v1.3


From 22a7919299c58b768fa86200e5ac651d3e33a37f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 28 Sep 2013 20:29:00 -0400
Subject: finish_automount() doesn't need vfsmount_lock for removal from expiry
 list

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index cd7dbabf0c6c..f57117d9f0ee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2064,9 +2064,7 @@ fail:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
 		namespace_lock();
-		br_write_lock(&vfsmount_lock);
 		list_del_init(&mnt->mnt_expire);
-		br_write_unlock(&vfsmount_lock);
 		namespace_unlock();
 	}
 	mntput(m);
-- 
cgit v1.3


From f6b742d8697ae0aeacb025e6e0ab3c61a6918846 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 28 Sep 2013 20:30:00 -0400
Subject: mnt_set_expiry() doesn't need vfsmount_lock

->mnt_expire is protected by namespace_sem

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index f57117d9f0ee..e45503d62fed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2080,11 +2080,9 @@ fail:
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
 
 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
 
-	br_write_unlock(&vfsmount_lock);
 	namespace_unlock();
 }
 EXPORT_SYMBOL(mnt_set_expiry);
-- 
cgit v1.3


From 9559f68915024ece8be3781003f25aef3677e574 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 28 Sep 2013 20:47:57 -0400
Subject: fold dup_mnt_ns() into its only surviving caller

should've been done 6 years ago...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 48 ++++++++++++++++++------------------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index e45503d62fed..c8d214ce30c6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2397,20 +2397,25 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 	return new_ns;
 }
 
-/*
- * Allocate a new namespace structure and populate it with contents
- * copied from the namespace of the passed in task structure.
- */
-static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
-		struct user_namespace *user_ns, struct fs_struct *fs)
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
 	struct mount *p, *q;
-	struct mount *old = mnt_ns->root;
+	struct mount *old;
 	struct mount *new;
 	int copy_flags;
 
+	BUG_ON(!ns);
+
+	if (likely(!(flags & CLONE_NEWNS))) {
+		get_mnt_ns(ns);
+		return ns;
+	}
+
+	old = ns->root;
+
 	new_ns = alloc_mnt_ns(user_ns);
 	if (IS_ERR(new_ns))
 		return new_ns;
@@ -2418,7 +2423,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	namespace_lock();
 	/* First pass: copy the tree topology */
 	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
-	if (user_ns != mnt_ns->user_ns)
+	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
@@ -2438,13 +2443,13 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	q = new;
 	while (p) {
 		q->mnt_ns = new_ns;
-		if (fs) {
-			if (&p->mnt == fs->root.mnt) {
-				fs->root.mnt = mntget(&q->mnt);
+		if (new_fs) {
+			if (&p->mnt == new_fs->root.mnt) {
+				new_fs->root.mnt = mntget(&q->mnt);
 				rootmnt = &p->mnt;
 			}
-			if (&p->mnt == fs->pwd.mnt) {
-				fs->pwd.mnt = mntget(&q->mnt);
+			if (&p->mnt == new_fs->pwd.mnt) {
+				new_fs->pwd.mnt = mntget(&q->mnt);
 				pwdmnt = &p->mnt;
 			}
 		}
@@ -2465,23 +2470,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	return new_ns;
 }
 
-struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
-		struct user_namespace *user_ns, struct fs_struct *new_fs)
-{
-	struct mnt_namespace *new_ns;
-
-	BUG_ON(!ns);
-	get_mnt_ns(ns);
-
-	if (!(flags & CLONE_NEWNS))
-		return ns;
-
-	new_ns = dup_mnt_ns(ns, user_ns, new_fs);
-
-	put_mnt_ns(ns);
-	return new_ns;
-}
-
 /**
  * create_mnt_ns - creates a private namespace and adds a root filesystem
  * @mnt: pointer to the new root filesystem mountpoint
-- 
cgit v1.3


From aba809cf0944fdc5a83ef8b5864cdce2a99b2513 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 28 Sep 2013 23:10:55 -0400
Subject: namespace.c: get rid of mnt_ghosts

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  2 +-
 fs/namespace.c | 22 +++++++---------------
 fs/pnode.c     |  3 +--
 3 files changed, 9 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/mount.h b/fs/mount.h
index 64a858143ff9..3168dc41bad5 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -55,7 +55,7 @@ struct mount {
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
-	int mnt_ghosts;
+	struct path mnt_ex_mountpoint;
 };
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index c8d214ce30c6..846ea43ab0c6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1136,20 +1136,8 @@ static void namespace_unlock(void)
 	while (!list_empty(&head)) {
 		mnt = list_first_entry(&head, struct mount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
-		if (mnt_has_parent(mnt)) {
-			struct dentry *dentry;
-			struct mount *m;
-
-			br_write_lock(&vfsmount_lock);
-			dentry = mnt->mnt_mountpoint;
-			m = mnt->mnt_parent;
-			mnt->mnt_mountpoint = mnt->mnt.mnt_root;
-			mnt->mnt_parent = mnt;
-			m->mnt_ghosts--;
-			br_write_unlock(&vfsmount_lock);
-			dput(dentry);
-			mntput(&m->mnt);
-		}
+		if (mnt->mnt_ex_mountpoint.mnt)
+			path_put(&mnt->mnt_ex_mountpoint);
 		mntput(&mnt->mnt);
 	}
 }
@@ -1181,8 +1169,12 @@ void umount_tree(struct mount *mnt, int propagate)
 		p->mnt_ns = NULL;
 		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
-			p->mnt_parent->mnt_ghosts++;
 			put_mountpoint(p->mnt_mp);
+			/* move the reference to mountpoint into ->mnt_ex_mountpoint */
+			p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
+			p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
+			p->mnt_mountpoint = p->mnt.mnt_root;
+			p->mnt_parent = p;
 			p->mnt_mp = NULL;
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
diff --git a/fs/pnode.c b/fs/pnode.c
index 9af0df15256e..3cfd48cf887e 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -278,8 +278,7 @@ out:
  */
 static inline int do_refcount_check(struct mount *mnt, int count)
 {
-	int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
-	return (mycount > count);
+	return mnt_get_count(mnt) > count;
 }
 
 /*
-- 
cgit v1.3


From aab407fc5c0ce63e9fd4a34a790d7290d3e116a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Sep 2013 10:59:59 -0400
Subject: don't bother with vfsmount_lock in mounts_poll()

wake_up_interruptible/poll_wait provide sufficient barriers;
just use ACCESS_ONCE() to fetch ns->event and that's it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc_namespace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 5fe34c355e85..439406e081af 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -20,15 +20,15 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
 	struct proc_mounts *p = proc_mounts(file->private_data);
 	struct mnt_namespace *ns = p->ns;
 	unsigned res = POLLIN | POLLRDNORM;
+	int event;
 
 	poll_wait(file, &p->ns->poll, wait);
 
-	br_read_lock(&vfsmount_lock);
-	if (p->m.poll_event != ns->event) {
-		p->m.poll_event = ns->event;
+	event = ACCESS_ONCE(ns->event);
+	if (p->m.poll_event != event) {
+		p->m.poll_event = event;
 		res |= POLLERR | POLLPRI;
 	}
-	br_read_unlock(&vfsmount_lock);
 
 	return res;
 }
-- 
cgit v1.3


From 719ea2fbb553ab3f61a174a4b5861289dcc46cb1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Sep 2013 11:24:49 -0400
Subject: new helpers: lock_mount_hash/unlock_mount_hash

aka br_write_{lock,unlock} of vfsmount_lock.  Inlines in fs/mount.h,
vfsmount_lock extern moved over there as well.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c |  1 +
 fs/internal.h     |  4 ---
 fs/mount.h        | 13 ++++++++
 fs/namespace.c    | 96 +++++++++++++++++++++++++++----------------------------
 fs/pnode.c        |  4 +--
 5 files changed, 64 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9f4935b8f208..09c11329a17c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,6 +26,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
+#include <linux/device.h>
 #include "internal.h"
 
 /*
diff --git a/fs/internal.h b/fs/internal.h
index 513e0d859a6c..4a11e75ce14d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,8 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/lglock.h>
-
 struct super_block;
 struct file_system_type;
 struct linux_binprm;
@@ -62,8 +60,6 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-extern struct lglock vfsmount_lock;
-
 extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
 extern void __mnt_drop_write(struct vfsmount *);
diff --git a/fs/mount.h b/fs/mount.h
index 3168dc41bad5..7076f25af35d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,6 +1,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
+#include <linux/lglock.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -83,6 +84,18 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
 	atomic_inc(&ns->count);
 }
 
+extern struct lglock vfsmount_lock;
+
+static inline void lock_mount_hash(void)
+{
+	br_write_lock(&vfsmount_lock);
+}
+
+static inline void unlock_mount_hash(void)
+{
+	br_write_unlock(&vfsmount_lock);
+}
+
 struct proc_mounts {
 	struct seq_file m;
 	struct mnt_namespace *ns;
diff --git a/fs/namespace.c b/fs/namespace.c
index 846ea43ab0c6..5cbe8cefadb5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -456,7 +456,7 @@ static int mnt_make_readonly(struct mount *mnt)
 {
 	int ret = 0;
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -490,15 +490,15 @@ static int mnt_make_readonly(struct mount *mnt)
 	 */
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct mount *mnt)
 {
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -510,7 +510,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
 			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -532,7 +532,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	return err;
 }
@@ -794,9 +794,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	mnt->mnt.mnt_sb = root->d_sb;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	return &mnt->mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -837,9 +837,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	mnt->mnt.mnt_root = dget(root);
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	if ((flag & CL_SLAVE) ||
 	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
@@ -883,28 +883,28 @@ put_again:
 	}
 	br_read_unlock(&vfsmount_lock);
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 		return;
 	}
 #else
 	mnt_add_count(mnt, -1);
 	if (likely(mnt_get_count(mnt)))
 		return;
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 #endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
 
 	list_del(&mnt->mnt_instance);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	/*
 	 * This probably indicates that somebody messed
@@ -945,21 +945,21 @@ EXPORT_SYMBOL(mntget);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	real_mount(mnt)->mnt_pinned++;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 }
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	if (mnt->mnt_pinned) {
 		mnt_add_count(mnt, 1);
 		mnt->mnt_pinned--;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 }
 EXPORT_SYMBOL(mnt_unpin);
 
@@ -1076,12 +1076,12 @@ int may_umount_tree(struct vfsmount *m)
 	BUG_ON(!m);
 
 	/* write lock needed for mnt_get_count */
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += mnt_get_count(p);
 		minimum_refs += 2;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -1108,10 +1108,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	if (propagate_mount_busy(real_mount(mnt), 2))
 		ret = 0;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1208,12 +1208,12 @@ static int do_umount(struct mount *mnt, int flags)
 		 * probably don't strictly need the lock here if we examined
 		 * all race cases, but it's a slowpath.
 		 */
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		if (mnt_get_count(mnt) != 2) {
-			br_write_unlock(&vfsmount_lock);
+			unlock_mount_hash();
 			return -EBUSY;
 		}
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 
 		if (!xchg(&mnt->mnt_expiry_mark, 1))
 			return -EAGAIN;
@@ -1255,7 +1255,7 @@ static int do_umount(struct mount *mnt, int flags)
 	}
 
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	event++;
 
 	if (!(flags & MNT_DETACH))
@@ -1267,7 +1267,7 @@ static int do_umount(struct mount *mnt, int flags)
 			umount_tree(mnt, 1);
 		retval = 0;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	namespace_unlock();
 	return retval;
 }
@@ -1410,18 +1410,18 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			q = clone_mnt(p, p->mnt.mnt_root, flag);
 			if (IS_ERR(q))
 				goto out;
-			br_write_lock(&vfsmount_lock);
+			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, parent, p->mnt_mp);
-			br_write_unlock(&vfsmount_lock);
+			unlock_mount_hash();
 		}
 	}
 	return res;
 out:
 	if (res) {
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		umount_tree(res, 0);
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 	}
 	return q;
 }
@@ -1443,9 +1443,9 @@ struct vfsmount *collect_mounts(struct path *path)
 void drop_collected_mounts(struct vfsmount *mnt)
 {
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	umount_tree(real_mount(mnt), 0);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -1572,7 +1572,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	if (err)
 		goto out_cleanup_ids;
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1591,7 +1591,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	return 0;
 
@@ -1693,10 +1693,10 @@ static int do_change_type(struct path *path, int flag)
 			goto out_unlock;
 	}
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
  out_unlock:
 	namespace_unlock();
@@ -1768,9 +1768,9 @@ static int do_loopback(struct path *path, const char *old_name,
 
 	err = graft_tree(mnt, parent, mp);
 	if (err) {
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		umount_tree(mnt, 0);
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 	}
 out2:
 	unlock_mount(mp);
@@ -1829,11 +1829,11 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
 		touch_mnt_namespace(mnt->mnt_ns);
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 	}
 	up_write(&sb->s_umount);
 	return err;
@@ -2093,7 +2093,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		return;
 
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -2112,7 +2112,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1);
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -2662,7 +2662,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!is_path_reachable(old_mnt, old.dentry, &new))
 		goto out4;
 	root_mp->m_count++; /* pin it so it won't go away */
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
 	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
@@ -2674,7 +2674,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* mount new_root on / */
 	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
 	put_mountpoint(root_mp);
 	error = 0;
@@ -2784,9 +2784,9 @@ void kern_unmount(struct vfsmount *mnt)
 {
 	/* release long term mount so mount point can be released */
 	if (!IS_ERR_OR_NULL(mnt)) {
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		real_mount(mnt)->mnt_ns = NULL;
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 		mntput(mnt);
 	}
 }
diff --git a/fs/pnode.c b/fs/pnode.c
index 3cfd48cf887e..58933fd149ad 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -264,12 +264,12 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		prev_src_mnt  = child;
 	}
 out:
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct mount, mnt_hash);
 		umount_tree(child, 0);
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	return ret;
 }
 
-- 
cgit v1.3


From 966c1f75f8e1e8e44d8277f7cc3598f6abbdda2f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Sep 2013 18:09:05 -0400
Subject: isofs: don't pass dentry to isofs_hash{i,}_common()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/isofs/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index e5d408a7ea4a..4a9e10ea13f2 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -181,7 +181,7 @@ struct iso9660_options{
  * Compute the hash for the isofs name corresponding to the dentry.
  */
 static int
-isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hash_common(struct qstr *qstr, int ms)
 {
 	const char *name;
 	int len;
@@ -202,7 +202,7 @@ isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
  * Compute the hash for the isofs name corresponding to the dentry.
  */
 static int
-isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hashi_common(struct qstr *qstr, int ms)
 {
 	const char *name;
 	int len;
@@ -259,13 +259,13 @@ static int isofs_dentry_cmp_common(
 static int
 isofs_hash(const struct dentry *dentry, struct qstr *qstr)
 {
-	return isofs_hash_common(dentry, qstr, 0);
+	return isofs_hash_common(qstr, 0);
 }
 
 static int
 isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
 {
-	return isofs_hashi_common(dentry, qstr, 0);
+	return isofs_hashi_common(qstr, 0);
 }
 
 static int
@@ -286,13 +286,13 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
 static int
 isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
 {
-	return isofs_hash_common(dentry, qstr, 1);
+	return isofs_hash_common(qstr, 1);
 }
 
 static int
 isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
 {
-	return isofs_hashi_common(dentry, qstr, 1);
+	return isofs_hashi_common(qstr, 1);
 }
 
 static int
-- 
cgit v1.3


From 7eb5e8826911f2792179f99e77e75fbb7ef53a4a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 1 Oct 2013 15:09:58 -0400
Subject: uninline destroy_super(), consolidate alloc_super()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 206 ++++++++++++++++++++++++++-----------------------------------
 1 file changed, 86 insertions(+), 120 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 0225c20f8770..efa6e488a95c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -129,33 +129,27 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	return total_objects;
 }
 
-static int init_sb_writers(struct super_block *s, struct file_system_type *type)
-{
-	int err;
-	int i;
-
-	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		err = percpu_counter_init(&s->s_writers.counter[i], 0);
-		if (err < 0)
-			goto err_out;
-		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-				 &type->s_writers_key[i], 0);
-	}
-	init_waitqueue_head(&s->s_writers.wait);
-	init_waitqueue_head(&s->s_writers.wait_unfrozen);
-	return 0;
-err_out:
-	while (--i >= 0)
-		percpu_counter_destroy(&s->s_writers.counter[i]);
-	return err;
-}
-
-static void destroy_sb_writers(struct super_block *s)
+/**
+ *	destroy_super	-	frees a superblock
+ *	@s: superblock to free
+ *
+ *	Frees a superblock.
+ */
+static void destroy_super(struct super_block *s)
 {
 	int i;
-
+	list_lru_destroy(&s->s_dentry_lru);
+	list_lru_destroy(&s->s_inode_lru);
+#ifdef CONFIG_SMP
+	free_percpu(s->s_files);
+#endif
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_counter_destroy(&s->s_writers.counter[i]);
+	security_sb_free(s);
+	WARN_ON(!list_empty(&s->s_mounts));
+	kfree(s->s_subtype);
+	kfree(s->s_options);
+	kfree(s);
 }
 
 /**
@@ -170,111 +164,83 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
 	static const struct super_operations default_op;
+	int i;
+
+	if (!s)
+		return NULL;
 
-	if (s) {
-		if (security_sb_alloc(s))
-			goto out_free_sb;
+	if (security_sb_alloc(s))
+		goto fail;
 
 #ifdef CONFIG_SMP
-		s->s_files = alloc_percpu(struct list_head);
-		if (!s->s_files)
-			goto err_out;
-		else {
-			int i;
-
-			for_each_possible_cpu(i)
-				INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
-		}
+	s->s_files = alloc_percpu(struct list_head);
+	if (!s->s_files)
+		goto fail;
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
 #else
-		INIT_LIST_HEAD(&s->s_files);
+	INIT_LIST_HEAD(&s->s_files);
 #endif
-		if (init_sb_writers(s, type))
-			goto err_out;
-		s->s_flags = flags;
-		s->s_bdi = &default_backing_dev_info;
-		INIT_HLIST_NODE(&s->s_instances);
-		INIT_HLIST_BL_HEAD(&s->s_anon);
-		INIT_LIST_HEAD(&s->s_inodes);
-
-		if (list_lru_init(&s->s_dentry_lru))
-			goto err_out;
-		if (list_lru_init(&s->s_inode_lru))
-			goto err_out_dentry_lru;
-
-		INIT_LIST_HEAD(&s->s_mounts);
-		init_rwsem(&s->s_umount);
-		lockdep_set_class(&s->s_umount, &type->s_umount_key);
-		/*
-		 * sget() can have s_umount recursion.
-		 *
-		 * When it cannot find a suitable sb, it allocates a new
-		 * one (this one), and tries again to find a suitable old
-		 * one.
-		 *
-		 * In case that succeeds, it will acquire the s_umount
-		 * lock of the old one. Since these are clearly distrinct
-		 * locks, and this object isn't exposed yet, there's no
-		 * risk of deadlocks.
-		 *
-		 * Annotate this by putting this lock in a different
-		 * subclass.
-		 */
-		down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
-		s->s_count = 1;
-		atomic_set(&s->s_active, 1);
-		mutex_init(&s->s_vfs_rename_mutex);
-		lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
-		mutex_init(&s->s_dquot.dqio_mutex);
-		mutex_init(&s->s_dquot.dqonoff_mutex);
-		init_rwsem(&s->s_dquot.dqptr_sem);
-		s->s_maxbytes = MAX_NON_LFS;
-		s->s_op = &default_op;
-		s->s_time_gran = 1000000000;
-		s->cleancache_poolid = -1;
-
-		s->s_shrink.seeks = DEFAULT_SEEKS;
-		s->s_shrink.scan_objects = super_cache_scan;
-		s->s_shrink.count_objects = super_cache_count;
-		s->s_shrink.batch = 1024;
-		s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+		if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
+			goto fail;
+		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+				 &type->s_writers_key[i], 0);
 	}
-out:
+	init_waitqueue_head(&s->s_writers.wait);
+	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	s->s_flags = flags;
+	s->s_bdi = &default_backing_dev_info;
+	INIT_HLIST_NODE(&s->s_instances);
+	INIT_HLIST_BL_HEAD(&s->s_anon);
+	INIT_LIST_HEAD(&s->s_inodes);
+
+	if (list_lru_init(&s->s_dentry_lru))
+		goto fail;
+	if (list_lru_init(&s->s_inode_lru))
+		goto fail;
+
+	INIT_LIST_HEAD(&s->s_mounts);
+	init_rwsem(&s->s_umount);
+	lockdep_set_class(&s->s_umount, &type->s_umount_key);
+	/*
+	 * sget() can have s_umount recursion.
+	 *
+	 * When it cannot find a suitable sb, it allocates a new
+	 * one (this one), and tries again to find a suitable old
+	 * one.
+	 *
+	 * In case that succeeds, it will acquire the s_umount
+	 * lock of the old one. Since these are clearly distrinct
+	 * locks, and this object isn't exposed yet, there's no
+	 * risk of deadlocks.
+	 *
+	 * Annotate this by putting this lock in a different
+	 * subclass.
+	 */
+	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
+	s->s_count = 1;
+	atomic_set(&s->s_active, 1);
+	mutex_init(&s->s_vfs_rename_mutex);
+	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
+	mutex_init(&s->s_dquot.dqio_mutex);
+	mutex_init(&s->s_dquot.dqonoff_mutex);
+	init_rwsem(&s->s_dquot.dqptr_sem);
+	s->s_maxbytes = MAX_NON_LFS;
+	s->s_op = &default_op;
+	s->s_time_gran = 1000000000;
+	s->cleancache_poolid = -1;
+
+	s->s_shrink.seeks = DEFAULT_SEEKS;
+	s->s_shrink.scan_objects = super_cache_scan;
+	s->s_shrink.count_objects = super_cache_count;
+	s->s_shrink.batch = 1024;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE;
 	return s;
 
-err_out_dentry_lru:
-	list_lru_destroy(&s->s_dentry_lru);
-err_out:
-	security_sb_free(s);
-#ifdef CONFIG_SMP
-	if (s->s_files)
-		free_percpu(s->s_files);
-#endif
-	destroy_sb_writers(s);
-out_free_sb:
-	kfree(s);
-	s = NULL;
-	goto out;
-}
-
-/**
- *	destroy_super	-	frees a superblock
- *	@s: superblock to free
- *
- *	Frees a superblock.
- */
-static inline void destroy_super(struct super_block *s)
-{
-	list_lru_destroy(&s->s_dentry_lru);
-	list_lru_destroy(&s->s_inode_lru);
-#ifdef CONFIG_SMP
-	free_percpu(s->s_files);
-#endif
-	destroy_sb_writers(s);
-	security_sb_free(s);
-	WARN_ON(!list_empty(&s->s_mounts));
-	kfree(s->s_subtype);
-	kfree(s->s_options);
-	kfree(s);
+fail:
+	destroy_super(s);
+	return NULL;
 }
 
 /* Superblock refcounting  */
-- 
cgit v1.3


From 474279dc0f7745124fc76b474c8dc1294f8e87ce Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 1 Oct 2013 16:11:26 -0400
Subject: split __lookup_mnt() in two functions

Instead of passing the direction as argument (and checking it on every
step through the hash chain), just have separate __lookup_mnt() and
__lookup_mnt_last().  And use the standard iterators...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mount.h     |  3 ++-
 fs/namei.c     |  4 ++--
 fs/namespace.c | 42 +++++++++++++++++++++++-------------------
 fs/pnode.c     |  6 +++---
 4 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/mount.h b/fs/mount.h
index 7076f25af35d..f0866076de6e 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -77,7 +77,8 @@ static inline int is_mounted(struct vfsmount *mnt)
 	return !IS_ERR_OR_NULL(real_mount(mnt));
 }
 
-extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
+extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
+extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
 static inline void get_mnt_ns(struct mnt_namespace *ns)
 {
diff --git a/fs/namei.c b/fs/namei.c
index 645268f23eb6..1f844fbfce72 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1111,7 +1111,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		if (!d_mountpoint(path->dentry))
 			break;
 
-		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
 		path->mnt = &mounted->mnt;
@@ -1132,7 +1132,7 @@ static void follow_mount_rcu(struct nameidata *nd)
 {
 	while (d_mountpoint(nd->path.dentry)) {
 		struct mount *mounted;
-		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
 		if (!mounted)
 			break;
 		nd->path.mnt = &mounted->mnt;
diff --git a/fs/namespace.c b/fs/namespace.c
index 5cbe8cefadb5..500202ce10db 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -548,29 +548,33 @@ static void free_vfsmnt(struct mount *mnt)
 }
 
 /*
- * find the first or last mount at @dentry on vfsmount @mnt depending on
- * @dir. If @dir is set return the first mount else return the last mount.
+ * find the first mount at @dentry on vfsmount @mnt.
  * vfsmount_lock must be held for read or write.
  */
-struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
-			      int dir)
+struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
-	struct list_head *tmp = head;
-	struct mount *p, *found = NULL;
+	struct mount *p;
 
-	for (;;) {
-		tmp = dir ? tmp->next : tmp->prev;
-		p = NULL;
-		if (tmp == head)
-			break;
-		p = list_entry(tmp, struct mount, mnt_hash);
-		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) {
-			found = p;
-			break;
-		}
-	}
-	return found;
+	list_for_each_entry(p, head, mnt_hash)
+		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
+			return p;
+	return NULL;
+}
+
+/*
+ * find the last mount at @dentry on vfsmount @mnt.
+ * vfsmount_lock must be held for read or write.
+ */
+struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct list_head *head = mount_hashtable + hash(mnt, dentry);
+	struct mount *p;
+
+	list_for_each_entry_reverse(p, head, mnt_hash)
+		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
+			return p;
+	return NULL;
 }
 
 /*
@@ -594,7 +598,7 @@ struct vfsmount *lookup_mnt(struct path *path)
 	struct mount *child_mnt;
 
 	br_read_lock(&vfsmount_lock);
-	child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
+	child_mnt = __lookup_mnt(path->mnt, path->dentry);
 	if (child_mnt) {
 		mnt_add_count(child_mnt, 1);
 		br_read_unlock(&vfsmount_lock);
diff --git a/fs/pnode.c b/fs/pnode.c
index 58933fd149ad..c7221bb19801 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -310,7 +310,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 
 	for (m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
-		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0);
+		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
 		if (child && list_empty(&child->mnt_mounts) &&
 		    (ret = do_refcount_check(child, 1)))
 			break;
@@ -332,8 +332,8 @@ static void __propagate_umount(struct mount *mnt)
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
 
-		struct mount *child = __lookup_mnt(&m->mnt,
-					mnt->mnt_mountpoint, 0);
+		struct mount *child = __lookup_mnt_last(&m->mnt,
+						mnt->mnt_mountpoint);
 		/*
 		 * umount the child only if the child has no
 		 * other children
-- 
cgit v1.3


From 94e92a6e772e009e27083a7f42d49a179e5f668c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 1 Oct 2013 16:18:06 -0400
Subject: move taking vfsmount_lock down into prepend_path()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 41000305d716..d888223a5486 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2889,6 +2889,7 @@ static int prepend_path(const struct path *path,
 	char *bptr;
 	int blen;
 
+	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
 restart:
 	bptr = *buffer;
@@ -2935,6 +2936,7 @@ restart:
 		goto restart;
 	}
 	done_seqretry(&rename_lock, seq);
+	br_read_unlock(&vfsmount_lock);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
@@ -2971,9 +2973,7 @@ char *__d_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
-	br_read_lock(&vfsmount_lock);
 	error = prepend_path(path, root, &res, &buflen);
-	br_read_unlock(&vfsmount_lock);
 
 	if (error < 0)
 		return ERR_PTR(error);
@@ -2990,9 +2990,7 @@ char *d_absolute_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
-	br_read_lock(&vfsmount_lock);
 	error = prepend_path(path, &root, &res, &buflen);
-	br_read_unlock(&vfsmount_lock);
 
 	if (error > 1)
 		error = -EINVAL;
@@ -3067,9 +3065,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
 
 	rcu_read_lock();
 	get_fs_root_rcu(current->fs, &root);
-	br_read_lock(&vfsmount_lock);
 	error = path_with_deleted(path, &root, &res, &buflen);
-	br_read_unlock(&vfsmount_lock);
 	rcu_read_unlock();
 
 	if (error < 0)
@@ -3224,7 +3220,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
 
 	error = -ENOENT;
-	br_read_lock(&vfsmount_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		char *cwd = page + PATH_MAX;
@@ -3232,7 +3227,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &root, &cwd, &buflen);
-		br_read_unlock(&vfsmount_lock);
 		rcu_read_unlock();
 
 		if (error < 0)
@@ -3253,7 +3247,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 				error = -EFAULT;
 		}
 	} else {
-		br_read_unlock(&vfsmount_lock);
 		rcu_read_unlock();
 	}
 
-- 
cgit v1.3


From b70a80e7a133a0c86f2fa078e7c144597c516415 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 1 Oct 2013 16:44:54 +0200
Subject: vfs: introduce d_instantiate_no_diralias()

...which just returns -EBUSY if a directory alias would be created.

This is to be used by fuse mkdir to make sure that a buggy or malicious
userspace filesystem doesn't do anything nasty.  Previously fuse used a
private mutex for this purpose, which can now go away.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/dcache.c            | 27 +++++++++++++++++++++++++++
 fs/fuse/dir.c          | 40 +++++-----------------------------------
 fs/fuse/fuse_i.h       |  3 ---
 fs/fuse/inode.c        |  2 --
 include/linux/dcache.h |  1 +
 5 files changed, 33 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index d888223a5486..c8e83d0d61ac 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1801,6 +1801,33 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 
 EXPORT_SYMBOL(d_instantiate_unique);
 
+/**
+ * d_instantiate_no_diralias - instantiate a non-aliased dentry
+ * @entry: dentry to complete
+ * @inode: inode to attach to this dentry
+ *
+ * Fill in inode information in the entry.  If a directory alias is found, then
+ * return an error (and drop inode).  Together with d_materialise_unique() this
+ * guarantees that a directory inode may never have more than one alias.
+ */
+int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
+{
+	BUG_ON(!hlist_unhashed(&entry->d_alias));
+
+	spin_lock(&inode->i_lock);
+	if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
+		spin_unlock(&inode->i_lock);
+		iput(inode);
+		return -EBUSY;
+	}
+	__d_instantiate(entry, inode);
+	spin_unlock(&inode->i_lock);
+	security_d_instantiate(entry, inode);
+
+	return 0;
+}
+EXPORT_SYMBOL(d_instantiate_no_diralias);
+
 struct dentry *d_make_root(struct inode *root_inode)
 {
 	struct dentry *res = NULL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b7989f2ab4c4..c3eb2c46c8f1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -342,24 +342,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 	return err;
 }
 
-static struct dentry *fuse_materialise_dentry(struct dentry *dentry,
-					      struct inode *inode)
-{
-	struct dentry *newent;
-
-	if (inode && S_ISDIR(inode->i_mode)) {
-		struct fuse_conn *fc = get_fuse_conn(inode);
-
-		mutex_lock(&fc->inst_mutex);
-		newent = d_materialise_unique(dentry, inode);
-		mutex_unlock(&fc->inst_mutex);
-	} else {
-		newent = d_materialise_unique(dentry, inode);
-	}
-
-	return newent;
-}
-
 static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 				  unsigned int flags)
 {
@@ -382,7 +364,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	if (inode && get_node_id(inode) == FUSE_ROOT_ID)
 		goto out_iput;
 
-	newent = fuse_materialise_dentry(entry, inode);
+	newent = d_materialise_unique(entry, inode);
 	err = PTR_ERR(newent);
 	if (IS_ERR(newent))
 		goto out_err;
@@ -601,21 +583,9 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	}
 	kfree(forget);
 
-	if (S_ISDIR(inode->i_mode)) {
-		struct dentry *alias;
-		mutex_lock(&fc->inst_mutex);
-		alias = d_find_alias(inode);
-		if (alias) {
-			/* New directory must have moved since mkdir */
-			mutex_unlock(&fc->inst_mutex);
-			dput(alias);
-			iput(inode);
-			return -EBUSY;
-		}
-		d_instantiate(entry, inode);
-		mutex_unlock(&fc->inst_mutex);
-	} else
-		d_instantiate(entry, inode);
+	err = d_instantiate_no_diralias(entry, inode);
+	if (err)
+		return err;
 
 	fuse_change_entry_timeout(entry, &outarg);
 	fuse_invalidate_attr(dir);
@@ -1284,7 +1254,7 @@ static int fuse_direntplus_link(struct file *file,
 	if (!inode)
 		goto out;
 
-	alias = fuse_materialise_dentry(dentry, inode);
+	alias = d_materialise_unique(dentry, inode);
 	err = PTR_ERR(alias);
 	if (IS_ERR(alias))
 		goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5b9e6f3b6aef..fe446c222e3f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -374,9 +374,6 @@ struct fuse_conn {
 	/** Lock protecting accessess to  members of this structure */
 	spinlock_t lock;
 
-	/** Mutex protecting against directory alias creation */
-	struct mutex inst_mutex;
-
 	/** Refcount */
 	atomic_t count;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8ce6dab60a0..1c15613c64f8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -565,7 +565,6 @@ void fuse_conn_init(struct fuse_conn *fc)
 {
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
-	mutex_init(&fc->inst_mutex);
 	init_rwsem(&fc->killsb);
 	atomic_set(&fc->count, 1);
 	init_waitqueue_head(&fc->waitq);
@@ -596,7 +595,6 @@ void fuse_conn_put(struct fuse_conn *fc)
 	if (atomic_dec_and_test(&fc->count)) {
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
-		mutex_destroy(&fc->inst_mutex);
 		fc->release(fc);
 	}
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 59066e0b4ff1..716c3760ee39 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -224,6 +224,7 @@ static inline int dname_external(const struct dentry *dentry)
 extern void d_instantiate(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
 extern struct dentry * d_materialise_unique(struct dentry *, struct inode *);
+extern int d_instantiate_no_diralias(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
 extern void d_delete(struct dentry *);
-- 
cgit v1.3


From e2fec7c35582e7bb41cccc1761faa2af4dc17627 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Oct 2013 17:06:56 -0400
Subject: make freeing super_block rcu-delayed

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 2 +-
 include/linux/fs.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index efa6e488a95c..743bb7118053 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -149,7 +149,7 @@ static void destroy_super(struct super_block *s)
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
 	kfree(s->s_options);
-	kfree(s);
+	kfree_rcu(s, rcu);
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b09e4e1d747a..2ab8a67ee054 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1330,6 +1330,7 @@ struct super_block {
 	 */
 	struct list_lru		s_dentry_lru ____cacheline_aligned_in_smp;
 	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
+	struct rcu_head		rcu;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
-- 
cgit v1.3


From 30687e0a47e89f56489ab73965ee88231e611986 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Oct 2013 12:25:10 -0400
Subject: hpfs: make freeing sbi and codetables rcu-delayed

makes ->d_hash() and ->d_compare() safety in RCU mode independent
from vfsmount_lock

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hpfs/hpfs_fn.h |  1 +
 fs/hpfs/super.c   | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1b398636e990..6797bf80f6e2 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -80,6 +80,7 @@ struct hpfs_sb_info {
 	unsigned sb_c_bitmap;		/* current bitmap */
 	unsigned sb_max_fwd_alloc;	/* max forwad allocation */
 	int sb_timeshift;
+	struct rcu_head rcu;
 };
 
 /* Four 512-byte buffers and the 2k block obtained by concatenating them */
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4334cda8dba1..b8d01ef6f531 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -101,18 +101,24 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 	return 0;
 }
 
-static void hpfs_put_super(struct super_block *s)
+static void free_sbi(struct hpfs_sb_info *sbi)
 {
-	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	kfree(sbi->sb_cp_table);
+	kfree(sbi->sb_bmp_dir);
+	kfree(sbi);
+}
 
+static void lazy_free_sbi(struct rcu_head *rcu)
+{
+	free_sbi(container_of(rcu, struct hpfs_sb_info, rcu));
+}
+
+static void hpfs_put_super(struct super_block *s)
+{
 	hpfs_lock(s);
 	unmark_dirty(s);
 	hpfs_unlock(s);
-
-	kfree(sbi->sb_cp_table);
-	kfree(sbi->sb_bmp_dir);
-	s->s_fs_info = NULL;
-	kfree(sbi);
+	call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi);
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -485,9 +491,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 	}
 	s->s_fs_info = sbi;
 
-	sbi->sb_bmp_dir = NULL;
-	sbi->sb_cp_table = NULL;
-
 	mutex_init(&sbi->hpfs_mutex);
 	hpfs_lock(s);
 
@@ -679,10 +682,7 @@ bail2:	brelse(bh0);
 bail1:
 bail0:
 	hpfs_unlock(s);
-	kfree(sbi->sb_bmp_dir);
-	kfree(sbi->sb_cp_table);
-	s->s_fs_info = NULL;
-	kfree(sbi);
+	free_sbi(sbi);
 	return -EINVAL;
 }
 
-- 
cgit v1.3


From 2d1d9b5b5cc2d7d528a7cbf621d924de38b1b6b6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Oct 2013 12:37:18 -0400
Subject: adfs: delayed freeing of sbi

makes ->d_hash() and ->d_compare() safety in RCU mode independent
from vfsmount_lock.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/adfs.h  | 9 ++++++---
 fs/adfs/super.c | 3 +--
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 585adafb0cc2..c770337c4b45 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -43,9 +43,12 @@ struct adfs_dir_ops;
  * ADFS file system superblock data in memory
  */
 struct adfs_sb_info {
-	struct adfs_discmap *s_map;	/* bh list containing map		 */
-	struct adfs_dir_ops *s_dir;	/* directory operations			 */
-
+	union { struct {
+		struct adfs_discmap *s_map;	/* bh list containing map	 */
+		struct adfs_dir_ops *s_dir;	/* directory operations		 */
+		};
+		struct rcu_head rcu;		/* used only at shutdown time	 */
+	};
 	kuid_t		s_uid;		/* owner uid				 */
 	kgid_t		s_gid;		/* owner gid				 */
 	umode_t		s_owner_mask;	/* ADFS owner perm -> unix perm		 */
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 0ff4bae2c2a2..7b3003cb6f1b 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -123,8 +123,7 @@ static void adfs_put_super(struct super_block *sb)
 	for (i = 0; i < asb->s_map_size; i++)
 		brelse(asb->s_map[i].dm_bh);
 	kfree(asb->s_map);
-	kfree(asb);
-	sb->s_fs_info = NULL;
+	kfree_rcu(asb, rcu);
 }
 
 static int adfs_show_options(struct seq_file *seq, struct dentry *root)
-- 
cgit v1.3


From baa40671d3e3b590a33b2c0e022db61cbebf5c00 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Oct 2013 12:46:44 -0400
Subject: autofs4: make freeing sbi rcu-delayed

makes ->d_managed() safety in RCU mode independent from vfsmount_lock

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  1 +
 fs/autofs4/inode.c    | 13 ++++---------
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index ebab147e9700..4218e26df916 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -122,6 +122,7 @@ struct autofs_sb_info {
 	spinlock_t lookup_lock;
 	struct list_head active_list;
 	struct list_head expiring_list;
+	struct rcu_head rcu;
 };
 
 static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index b104726e2d0a..3b9cc9b973c2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -56,18 +56,13 @@ void autofs4_kill_sb(struct super_block *sb)
 	 * just call kill_anon_super when we are called from
 	 * deactivate_super.
 	 */
-	if (!sbi)
-		goto out_kill_sb;
-
-	/* Free wait queues, close pipe */
-	autofs4_catatonic_mode(sbi);
-
-	sb->s_fs_info = NULL;
-	kfree(sbi);
+	if (sbi) /* Free wait queues, close pipe */
+		autofs4_catatonic_mode(sbi);
 
-out_kill_sb:
 	DPRINTK("shutting down");
 	kill_litter_super(sb);
+	if (sbi)
+		kfree_rcu(sbi, rcu);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct dentry *root)
-- 
cgit v1.3


From 2e32cf5ef2683a196df24ccd2cd2b4ed735ad9d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Oct 2013 12:53:37 -0400
Subject: cifs: rcu-delay unload_nls() and freeing sbi

makes ->d_hash(), ->d_compare() and ->permission() safety in RCU mode
independent from vfsmount_lock.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cifs/cifs_fs_sb.h |  1 +
 fs/cifs/connect.c    | 10 ++++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 37e4a72a7d1c..9409fa10bd5c 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -65,5 +65,6 @@ struct cifs_sb_info {
 	char   *mountdata; /* options received at mount time or via DFS refs */
 	struct backing_dev_info bdi;
 	struct delayed_work prune_tlinks;
+	struct rcu_head rcu;
 };
 #endif				/* _CIFS_FS_SB_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a279ffc0bc29..886db8b41e35 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3755,6 +3755,13 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 	return rc;
 }
 
+static void delayed_free(struct rcu_head *p)
+{
+	struct cifs_sb_info *sbi = container_of(p, struct cifs_sb_info, rcu);
+	unload_nls(sbi->local_nls);
+	kfree(sbi);
+}
+
 void
 cifs_umount(struct cifs_sb_info *cifs_sb)
 {
@@ -3779,8 +3786,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
 
 	bdi_destroy(&cifs_sb->bdi);
 	kfree(cifs_sb->mountdata);
-	unload_nls(cifs_sb->local_nls);
-	kfree(cifs_sb);
+	call_rcu(&cifs_sb->rcu, delayed_free);
 }
 
 int
-- 
cgit v1.3


From cac45b062c67f86dc1d91d675128838773523243 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Oct 2013 13:16:50 -0400
Subject: fat: rcu-delay unloading nls and freeing sbi

makes ->d_hash() and ->d_compare() safety in RCU mode independent
from vfsmount_lock.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fat/fat.h   |  1 +
 fs/fat/inode.c | 19 +++++++++++--------
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 4241e6f39e86..7c31f4bc74a9 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -102,6 +102,7 @@ struct msdos_sb_info {
 	struct hlist_head dir_hashtable[FAT_HASH_SIZE];
 
 	unsigned int dirty;           /* fs state before mount */
+	struct rcu_head rcu;
 };
 
 #define FAT_CACHE_VALID	0	/* special case for valid cache */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0062da21dd8b..854b578f6695 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -548,6 +548,16 @@ static void fat_set_state(struct super_block *sb,
 	brelse(bh);
 }
 
+static void delayed_free(struct rcu_head *p)
+{
+	struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu);
+	unload_nls(sbi->nls_disk);
+	unload_nls(sbi->nls_io);
+	if (sbi->options.iocharset != fat_default_iocharset)
+		kfree(sbi->options.iocharset);
+	kfree(sbi);
+}
+
 static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -557,14 +567,7 @@ static void fat_put_super(struct super_block *sb)
 	iput(sbi->fsinfo_inode);
 	iput(sbi->fat_inode);
 
-	unload_nls(sbi->nls_disk);
-	unload_nls(sbi->nls_io);
-
-	if (sbi->options.iocharset != fat_default_iocharset)
-		kfree(sbi->options.iocharset);
-
-	sb->s_fs_info = NULL;
-	kfree(sbi);
+	call_rcu(&sbi->rcu, delayed_free);
 }
 
 static struct kmem_cache *fat_inode_cachep;
-- 
cgit v1.3


From 1dcddd4abd2c6df7f28928ad5cafa4a1cd20030b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Oct 2013 13:22:44 -0400
Subject: ncpfs: rcu-delay unload_nls() and freeing ncp_server

makes ->d_hash() and ->d_compare() safety in RCU mode independent
from vfsmount_lock.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ncpfs/inode.c     | 19 ++++++++++++-------
 fs/ncpfs/ncp_fs_sb.h |  2 +-
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 4659da67e7f6..2cf2ebecb55f 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -782,6 +782,17 @@ out:
 	return error;
 }
 
+static void delayed_free(struct rcu_head *p)
+{
+	struct ncp_server *server = container_of(p, struct ncp_server, rcu);
+#ifdef CONFIG_NCPFS_NLS
+	/* unload the NLS charsets */
+	unload_nls(server->nls_vol);
+	unload_nls(server->nls_io);
+#endif /* CONFIG_NCPFS_NLS */
+	kfree(server);
+}
+
 static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);
@@ -792,11 +803,6 @@ static void ncp_put_super(struct super_block *sb)
 
 	ncp_stop_tasks(server);
 
-#ifdef CONFIG_NCPFS_NLS
-	/* unload the NLS charsets */
-	unload_nls(server->nls_vol);
-	unload_nls(server->nls_io);
-#endif /* CONFIG_NCPFS_NLS */
 	mutex_destroy(&server->rcv.creq_mutex);
 	mutex_destroy(&server->root_setup_lock);
 	mutex_destroy(&server->mutex);
@@ -813,8 +819,7 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->rxbuf);
 	vfree(server->txbuf);
 	vfree(server->packet);
-	sb->s_fs_info = NULL;
-	kfree(server);
+	call_rcu(&server->rcu, delayed_free);
 }
 
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index c51b2c543539..b81e97adc5a9 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -38,7 +38,7 @@ struct ncp_mount_data_kernel {
 };
 
 struct ncp_server {
-
+	struct rcu_head rcu;
 	struct ncp_mount_data_kernel m;	/* Nearly all of the mount data is of
 					   interest for us later, so we store
 					   it completely. */
-- 
cgit v1.3


From dd3e2c55a45fe75f3213b17b74f9a32380f4e84b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 3 Oct 2013 21:21:39 -0400
Subject: fuse: rcu-delay freeing fuse_conn

makes ->permission() and ->d_revalidate() safety in RCU mode independent
from vfsmount_lock.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fuse/cuse.c   | 2 +-
 fs/fuse/fuse_i.h | 2 ++
 fs/fuse/inode.c  | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index adbfd66b380f..8ca88d716dd3 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -473,7 +473,7 @@ err:
 static void cuse_fc_release(struct fuse_conn *fc)
 {
 	struct cuse_conn *cc = fc_to_cc(fc);
-	kfree(cc);
+	kfree_rcu(cc, fc.rcu);
 }
 
 /**
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fe446c222e3f..4715674f050c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -377,6 +377,8 @@ struct fuse_conn {
 	/** Refcount */
 	atomic_t count;
 
+	struct rcu_head rcu;
+
 	/** The user id for this mount */
 	kuid_t user_id;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1c15613c64f8..d468643a68b2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -918,7 +918,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 
 static void fuse_free_conn(struct fuse_conn *fc)
 {
-	kfree(fc);
+	kfree_rcu(fc, rcu);
 }
 
 static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
-- 
cgit v1.3


From b9c0622516b73170fa9abffece3079920b78ed6f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 23 Oct 2013 21:44:53 +0800
Subject: sysfs: fix sysfs_write_file for bin file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before patch(sysfs: prepare path write for unified regular / bin
file handling), when size of bin file is zero, writting still can
continue, but this patch changes the behaviour.

The worse thing is that firmware loader is broken by this patch,
and user space application can't write to firmware bin file any more
because both firmware loader and drivers can't know at advance how
large the firmware file is and have to set its initialized size as
zero.

This patch fixes the problem and keeps behaviour of writting to bin
as before.

Reported-by: Lothar Waßmann <LW@karo-electronics.de>
Tested-by: Lothar Waßmann <LW@karo-electronics.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 5d818df7250b..c3795978b404 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -275,11 +275,10 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
 {
 	struct sysfs_open_file *of = sysfs_of(file);
 	ssize_t len = min_t(size_t, count, PAGE_SIZE);
+	loff_t size = file_inode(file)->i_size;
 	char *buf;
 
-	if (sysfs_is_bin(of->sd)) {
-		loff_t size = file_inode(file)->i_size;
-
+	if (sysfs_is_bin(of->sd) && size) {
 		if (size <= *ppos)
 			return 0;
 		len = min_t(ssize_t, len, size - *ppos);
-- 
cgit v1.3


From dcdfff65276fdc6dfe5eb1d0aff802dfa7a95e15 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Tue, 22 Oct 2013 20:56:10 +0900
Subject: f2fs: clean up several status-related operations

This patch cleans up improper definitions that update some status information.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c |  8 ++------
 fs/f2fs/data.c       | 12 +++---------
 fs/f2fs/f2fs.h       | 18 +++++++++++++++++-
 fs/f2fs/gc.c         |  4 +---
 fs/f2fs/segment.c    | 10 ++++------
 5 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6fb484c84ee7..b4a59cf99f1c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -467,9 +467,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 			return -EEXIST;
 	}
 	list_add_tail(&new->list, head);
-#ifdef CONFIG_F2FS_STAT_FS
-	sbi->n_dirty_dirs++;
-#endif
+	stat_inc_dirty_dir(sbi);
 	return 0;
 }
 
@@ -531,9 +529,7 @@ void remove_dirty_dir_inode(struct inode *inode)
 		if (entry->inode == inode) {
 			list_del(&entry->list);
 			kmem_cache_free(inode_entry_slab, entry);
-#ifdef CONFIG_F2FS_STAT_FS
-			sbi->n_dirty_dirs--;
-#endif
+			stat_dec_dirty_dir(sbi);
 			break;
 		}
 	}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2535d3b1a763..1cccf98f0e4d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -68,9 +68,6 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 					struct buffer_head *bh_result)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-#ifdef CONFIG_F2FS_STAT_FS
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-#endif
 	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr;
 
@@ -80,9 +77,8 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		return 0;
 	}
 
-#ifdef CONFIG_F2FS_STAT_FS
-	sbi->total_hit_ext++;
-#endif
+	stat_inc_total_hit(inode->i_sb);
+
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
 	start_blkaddr = fi->ext.blk_addr;
@@ -100,9 +96,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		else
 			bh_result->b_size = UINT_MAX;
 
-#ifdef CONFIG_F2FS_STAT_FS
-		sbi->read_hit_ext++;
-#endif
+		stat_inc_read_hit(inode->i_sb);
 		read_unlock(&fi->ext.ext_lock);
 		return 1;
 	}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 29492754bd7d..7ca8c30d8d76 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1155,7 +1155,16 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 	return (struct f2fs_stat_info*)sbi->stat_info;
 }
 
-#define stat_inc_call_count(si)	((si)->call_count++)
+#define stat_inc_call_count(si)		((si)->call_count++)
+#define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
+#define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
+#define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
+#define stat_inc_total_hit(sb)		((F2FS_SB(sb))->total_hit_ext++)
+#define stat_inc_read_hit(sb)		((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_seg_type(sbi, curseg)					\
+		((sbi)->segment_count[(curseg)->alloc_type]++)
+#define stat_inc_block_count(sbi, curseg)				\
+		((sbi)->block_count[(curseg)->alloc_type]++)
 
 #define stat_inc_seg_count(sbi, type)					\
 	do {								\
@@ -1190,6 +1199,13 @@ void __init f2fs_create_root_stats(void);
 void f2fs_destroy_root_stats(void);
 #else
 #define stat_inc_call_count(si)
+#define stat_inc_bggc_count(si)
+#define stat_inc_dirty_dir(sbi)
+#define stat_dec_dirty_dir(sbi)
+#define stat_inc_total_hit(sb)
+#define stat_inc_read_hit(sb)
+#define stat_inc_seg_type(sbi, curseg)
+#define stat_inc_block_count(sbi, curseg)
 #define stat_inc_seg_count(si, type)
 #define stat_inc_tot_blk_count(si, blks)
 #define stat_inc_data_blk_count(si, blks)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 7914b92a6967..cb286d7b02b2 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -77,9 +77,7 @@ static int gc_thread_func(void *data)
 		else
 			wait_ms = increase_sleep_time(gc_th, wait_ms);
 
-#ifdef CONFIG_F2FS_STAT_FS
-		sbi->bg_gc++;
-#endif
+		stat_inc_bggc_count(sbi);
 
 		/* if return value is not zero, no victim was selected */
 		if (f2fs_gc(sbi))
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 8ac161965242..177a33b16b2c 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -554,9 +554,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
 		change_curseg(sbi, type, true);
 	else
 		new_curseg(sbi, type, false);
-#ifdef CONFIG_F2FS_STAT_FS
-	sbi->segment_count[curseg->alloc_type]++;
-#endif
+
+	stat_inc_seg_type(sbi, curseg);
 }
 
 void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -811,9 +810,8 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
 
 	mutex_lock(&sit_i->sentry_lock);
 	__refresh_next_blkoff(sbi, curseg);
-#ifdef CONFIG_F2FS_STAT_FS
-	sbi->block_count[curseg->alloc_type]++;
-#endif
+
+	stat_inc_block_count(sbi, curseg);
 
 	/*
 	 * SIT information should be updated before segment allocation,
-- 
cgit v1.3


From aabe51364f44681cbd83fb1c27ef7d3dbe567c45 Mon Sep 17 00:00:00 2001
From: Haicheng Li <haicheng.li@linux.intel.com>
Date: Wed, 23 Oct 2013 12:39:32 +0800
Subject: f2fs: use bool for booleans

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c | 4 ++--
 fs/f2fs/f2fs.h       | 4 ++--
 fs/f2fs/node.c       | 4 ++--
 fs/f2fs/recovery.c   | 8 ++++----
 fs/f2fs/super.c      | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b4a59cf99f1c..5649a9d8e942 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -275,7 +275,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
 		return 0;
 
-	sbi->por_doing = 1;
+	sbi->por_doing = true;
 	start_blk = __start_cp_addr(sbi) + 1;
 	orphan_blkaddr = __start_sum_addr(sbi) - 1;
 
@@ -292,7 +292,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	}
 	/* clear Orphan Flag */
 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
-	sbi->por_doing = 0;
+	sbi->por_doing = false;
 	return 0;
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7ca8c30d8d76..e8f3fd345ca3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -361,8 +361,8 @@ struct f2fs_sb_info {
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
 	struct mutex node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
-	int por_doing;				/* recovery is doing or not */
-	int on_build_free_nids;			/* build_free_nids is doing */
+	bool por_doing;				/* recovery is doing or not */
+	bool on_build_free_nids;		/* build_free_nids is doing */
 	struct task_struct *cp_task;		/* checkpoint task */
 
 	/* for orphan inode management */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4fa3fd5def34..cc119b65a0d3 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1439,9 +1439,9 @@ retry:
 
 	/* Let's scan nat pages and its caches to get free nids */
 	mutex_lock(&nm_i->build_lock);
-	sbi->on_build_free_nids = 1;
+	sbi->on_build_free_nids = true;
 	build_free_nids(sbi);
-	sbi->on_build_free_nids = 0;
+	sbi->on_build_free_nids = false;
 	mutex_unlock(&nm_i->build_lock);
 	goto retry;
 }
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 353cf4f66c7b..b278c68b3e08 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -425,7 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 {
 	struct list_head inode_list;
 	int err;
-	int need_writecp = 0;
+	bool need_writecp = false;
 
 	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
 			sizeof(struct fsync_inode_entry), NULL);
@@ -435,7 +435,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	INIT_LIST_HEAD(&inode_list);
 
 	/* step #1: find fsynced inode numbers */
-	sbi->por_doing = 1;
+	sbi->por_doing = true;
 	err = find_fsync_dnodes(sbi, &inode_list);
 	if (err)
 		goto out;
@@ -443,7 +443,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	if (list_empty(&inode_list))
 		goto out;
 
-	need_writecp = 1;
+	need_writecp = true;
 
 	/* step #2: recover data */
 	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
@@ -451,7 +451,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 out:
 	destroy_fsync_dnodes(&inode_list);
 	kmem_cache_destroy(fsync_entry_slab);
-	sbi->por_doing = 0;
+	sbi->por_doing = false;
 	if (!err && need_writecp)
 		write_checkpoint(sbi, false);
 	return err;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 692f35fbae1a..9a094596fd21 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -850,7 +850,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	mutex_init(&sbi->writepages);
 	mutex_init(&sbi->cp_mutex);
 	mutex_init(&sbi->node_write);
-	sbi->por_doing = 0;
+	sbi->por_doing = false;
 	spin_lock_init(&sbi->stat_lock);
 	init_rwsem(&sbi->bio_sem);
 	init_rwsem(&sbi->cp_rwsem);
-- 
cgit v1.3


From 81eb8d6e2869b119d4a7b8c02091c3779733a3ac Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Thu, 24 Oct 2013 13:31:34 +0900
Subject: f2fs: reclaim prefree segments periodically

Previously, f2fs postpones reclaiming prefree segments into free segments
as much as possible.
However, if user writes and deletes a bunch of data without any sync or fsync
calls, some flash storages can suffer from garbage collections.

So, this patch adds the reclaiming codes to f2fs_write_node_pages and background
GC thread.

If there are a lot of prefree segments, let's do checkpoint so that f2fs
submits discard commands for the prefree regions to the flash storage.

Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/f2fs.h    | 3 +++
 fs/f2fs/gc.c      | 5 +++++
 fs/f2fs/node.c    | 3 ++-
 fs/f2fs/segment.c | 1 +
 fs/f2fs/segment.h | 7 +++++++
 5 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e8f3fd345ca3..6dff777cd94a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -299,6 +299,9 @@ struct f2fs_sm_info {
 	unsigned int main_segments;	/* # of segments in main area */
 	unsigned int reserved_segments;	/* # of reserved segments */
 	unsigned int ovp_segments;	/* # of overprovision segments */
+
+	/* a threshold to reclaim prefree segments */
+	unsigned int rec_prefree_segments;
 };
 
 /*
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index cb286d7b02b2..783c6cc6253c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -82,6 +82,11 @@ static int gc_thread_func(void *data)
 		/* if return value is not zero, no victim was selected */
 		if (f2fs_gc(sbi))
 			wait_ms = gc_th->no_gc_sleep_time;
+
+		/* balancing prefree segments */
+		if (excess_prefree_segs(sbi))
+			f2fs_sync_fs(sbi->sb, true);
+
 	} while (!kthread_should_stop());
 	return 0;
 }
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index cc119b65a0d3..89dd8a5a3ca3 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1206,7 +1206,8 @@ static int f2fs_write_node_pages(struct address_space *mapping,
 	long nr_to_write = wbc->nr_to_write;
 
 	/* First check balancing cached NAT entries */
-	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
+	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
+				excess_prefree_segs(sbi)) {
 		f2fs_sync_fs(sbi->sb, true);
 		return 0;
 	}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 177a33b16b2c..62b52f2f293a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1645,6 +1645,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
 	sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
 	sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
 	sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+	sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS;
 
 	err = build_sit_info(sbi);
 	if (err)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7f94d78cda3d..abe7094c4f7a 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -14,6 +14,8 @@
 #define NULL_SEGNO			((unsigned int)(~0))
 #define NULL_SECNO			((unsigned int)(~0))
 
+#define DEF_RECLAIM_PREFREE_SEGMENTS	100	/* 200MB of prefree segments */
+
 /* L: Logical segment # in volume, R: Relative segment # in main area */
 #define GET_L2R_SEGNO(free_i, segno)	(segno - free_i->start_segno)
 #define GET_R2L_SEGNO(free_i, segno)	(segno + free_i->start_segno)
@@ -472,6 +474,11 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
 						reserved_sections(sbi)));
 }
 
+static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
+{
+	return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments);
+}
+
 static inline int utilization(struct f2fs_sb_info *sbi)
 {
 	return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count);
-- 
cgit v1.3


From 4660f9c0fe484353b17a4b9d1cc2b036fa895f76 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Thu, 24 Oct 2013 14:19:18 +0900
Subject: f2fs: introduce f2fs_balance_fs_bg for some background jobs

This patch merges some background jobs into this new function.

Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/f2fs.h    |  2 ++
 fs/f2fs/gc.c      |  5 ++---
 fs/f2fs/node.c    | 10 +++-------
 fs/f2fs/segment.c |  8 ++++++++
 4 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6dff777cd94a..a61cc5f84638 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -965,6 +965,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
  */
 void f2fs_set_inode_flags(struct inode *);
 struct inode *f2fs_iget(struct super_block *, unsigned long);
+int try_to_free_nats(struct f2fs_sb_info *, int);
 void update_inode(struct inode *, struct page *);
 int update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
@@ -1045,6 +1046,7 @@ void destroy_node_manager_caches(void);
  * segment.c
  */
 void f2fs_balance_fs(struct f2fs_sb_info *);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
 int npages_for_summary_flush(struct f2fs_sb_info *);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 783c6cc6253c..b7ad1ec7e4cc 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -83,9 +83,8 @@ static int gc_thread_func(void *data)
 		if (f2fs_gc(sbi))
 			wait_ms = gc_th->no_gc_sleep_time;
 
-		/* balancing prefree segments */
-		if (excess_prefree_segs(sbi))
-			f2fs_sync_fs(sbi->sb, true);
+		/* balancing f2fs's metadata periodically */
+		f2fs_balance_fs_bg(sbi);
 
 	} while (!kthread_should_stop());
 	return 0;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 89dd8a5a3ca3..7bac4815f1d6 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -240,7 +240,7 @@ retry:
 	write_unlock(&nm_i->nat_tree_lock);
 }
 
-static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 
@@ -1205,12 +1205,8 @@ static int f2fs_write_node_pages(struct address_space *mapping,
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
 	long nr_to_write = wbc->nr_to_write;
 
-	/* First check balancing cached NAT entries */
-	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
-				excess_prefree_segs(sbi)) {
-		f2fs_sync_fs(sbi->sb, true);
-		return 0;
-	}
+	/* balancing f2fs's metadata in background */
+	f2fs_balance_fs_bg(sbi);
 
 	/* collect a number of dirty node pages and write together */
 	if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 62b52f2f293a..8e5ca2869913 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -36,6 +36,14 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
 	}
 }
 
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
+{
+	/* check the # of cached NAT entries and prefree segments */
+	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
+				excess_prefree_segs(sbi))
+		f2fs_sync_fs(sbi->sb, true);
+}
+
 static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 		enum dirty_type dirty_type)
 {
-- 
cgit v1.3


From ea91e9b043057d29e092dd4d0a13cd79b78fb5b5 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Thu, 24 Oct 2013 15:49:07 +0900
Subject: f2fs: add reclaiming control by sysfs

This patch adds a control method in sysfs to reclaim prefree segments.

Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 Documentation/filesystems/f2fs.txt |  6 +++++
 fs/f2fs/super.c                    | 50 +++++++++++++++++++++++++++-----------
 2 files changed, 42 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 3cd27bed6349..4c647c280dcc 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -164,6 +164,12 @@ Files in /sys/fs/f2fs/<devname>
                               gc_idle = 1 will select the Cost Benefit approach
                               & setting gc_idle = 2 will select the greedy aproach.
 
+ reclaim_segments             This parameter controls the number of prefree
+                              segments to be reclaimed. If the number of prefree
+			      segments is larger than this number, f2fs tries to
+			      conduct checkpoint to reclaim the prefree segments
+			      to free segments. By default, 100 segments, 200MB.
+
 ================================================================================
 USAGE
 ================================================================================
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 9a094596fd21..e42351cbe166 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -69,24 +69,40 @@ static match_table_t f2fs_tokens = {
 };
 
 /* Sysfs support for f2fs */
+enum {
+	GC_THREAD,	/* struct f2fs_gc_thread */
+	SM_INFO,	/* struct f2fs_sm_info */
+};
+
 struct f2fs_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
 	ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
 			 const char *, size_t);
+	int struct_type;
 	int offset;
 };
 
+static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
+{
+	if (struct_type == GC_THREAD)
+		return (unsigned char *)sbi->gc_thread;
+	else if (struct_type == SM_INFO)
+		return (unsigned char *)SM_I(sbi);
+	return NULL;
+}
+
 static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
 			struct f2fs_sb_info *sbi, char *buf)
 {
-	struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+	unsigned char *ptr = NULL;
 	unsigned int *ui;
 
-	if (!gc_kth)
+	ptr = __struct_ptr(sbi, a->struct_type);
+	if (!ptr)
 		return -EINVAL;
 
-	ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+	ui = (unsigned int *)(ptr + a->offset);
 
 	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 }
@@ -95,15 +111,16 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
 			struct f2fs_sb_info *sbi,
 			const char *buf, size_t count)
 {
-	struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+	unsigned char *ptr;
 	unsigned long t;
 	unsigned int *ui;
 	ssize_t ret;
 
-	if (!gc_kth)
+	ptr = __struct_ptr(sbi, a->struct_type);
+	if (!ptr)
 		return -EINVAL;
 
-	ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+	ui = (unsigned int *)(ptr + a->offset);
 
 	ret = kstrtoul(skip_spaces(buf), 0, &t);
 	if (ret < 0)
@@ -139,21 +156,25 @@ static void f2fs_sb_release(struct kobject *kobj)
 	complete(&sbi->s_kobj_unregister);
 }
 
-#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \
+#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
 static struct f2fs_attr f2fs_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
-	.offset = offsetof(struct f2fs_gc_kthread, _elname),	\
+	.struct_type = _struct_type,				\
+	.offset = _offset					\
 }
 
-#define F2FS_RW_ATTR(name, elname)	\
-	F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname)
+#define F2FS_RW_ATTR(struct_type, struct_name, name, elname)	\
+	F2FS_ATTR_OFFSET(struct_type, name, 0644,		\
+		f2fs_sbi_show, f2fs_sbi_store,			\
+		offsetof(struct struct_name, elname))
 
-F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time);
-F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time);
-F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
-F2FS_RW_ATTR(gc_idle, gc_idle);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -161,6 +182,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(gc_max_sleep_time),
 	ATTR_LIST(gc_no_gc_sleep_time),
 	ATTR_LIST(gc_idle),
+	ATTR_LIST(reclaim_segments),
 	NULL,
 };
 
-- 
cgit v1.3


From e8d61a7488d06aba3e7226e3536a6a6e14391ce8 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 24 Oct 2013 15:08:28 +0800
Subject: f2fs: remove redundant set_page_dirty from write_compacted_summaries

Previously, set_page_dirty is called every time after writting one summary info
into compacted summary page,
To avoid redundant set_page_dirty, we only call set_page_dirty before release
page.

Signed-off-by: Yu Chao <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 8e5ca2869913..487af619aeda 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1138,8 +1138,6 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
 						SUM_JOURNAL_SIZE);
 	written_size += SUM_JOURNAL_SIZE;
 
-	set_page_dirty(page);
-
 	/* Step 3: write summary entries */
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
 		unsigned short blkoff;
@@ -1158,18 +1156,20 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
 			summary = (struct f2fs_summary *)(kaddr + written_size);
 			*summary = seg_i->sum_blk->entries[j];
 			written_size += SUMMARY_SIZE;
-			set_page_dirty(page);
 
 			if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
 							SUM_FOOTER_SIZE)
 				continue;
 
+			set_page_dirty(page);
 			f2fs_put_page(page, 1);
 			page = NULL;
 		}
 	}
-	if (page)
+	if (page) {
+		set_page_dirty(page);
 		f2fs_put_page(page, 1);
+	}
 }
 
 static void write_normal_summaries(struct f2fs_sb_info *sbi,
-- 
cgit v1.3


From 26c6b8879911df991dc780c67eaeb84c7629949d Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Thu, 24 Oct 2013 17:53:29 +0900
Subject: f2fs: add tracepoint for set_page_dirty

This patch adds a tracepoint for set_page_dirty.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c        |  2 ++
 fs/f2fs/data.c              |  2 ++
 fs/f2fs/node.c              |  2 ++
 include/trace/events/f2fs.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 50 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5649a9d8e942..6526f5010621 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -167,6 +167,8 @@ static int f2fs_set_meta_page_dirty(struct page *page)
 	struct address_space *mapping = page->mapping;
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
 
+	trace_f2fs_set_page_dirty(page, META);
+
 	SetPageUptodate(page);
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1cccf98f0e4d..c8887d847dcf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -756,6 +756,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 
+	trace_f2fs_set_page_dirty(page, DATA);
+
 	SetPageUptodate(page);
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7bac4815f1d6..304d5ced6355 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1225,6 +1225,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
 	struct address_space *mapping = page->mapping;
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
 
+	trace_f2fs_set_page_dirty(page, NODE);
+
 	SetPageUptodate(page);
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 52ae54828eda..ebde3afb89cd 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -36,6 +36,11 @@
 		{ CURSEG_COLD_NODE, 	"Cold NODE" },			\
 		{ NO_CHECK_TYPE, 	"No TYPE" })
 
+#define show_file_type(type)						\
+	__print_symbolic(type,						\
+		{ 0,		"FILE" },				\
+		{ 1,		"DIR" })
+
 #define show_gc_type(type)						\
 	__print_symbolic(type,						\
 		{ FG_GC,	"Foreground GC" },			\
@@ -623,6 +628,45 @@ TRACE_EVENT(f2fs_do_submit_bio,
 		__entry->size)
 );
 
+DECLARE_EVENT_CLASS(f2fs__page,
+
+	TP_PROTO(struct page *page, int type),
+
+	TP_ARGS(page, type),
+
+	TP_STRUCT__entry(
+		__field(dev_t,	dev)
+		__field(ino_t,	ino)
+		__field(int, type)
+		__field(int, dir)
+		__field(pgoff_t, index)
+		__field(int, dirty)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= page->mapping->host->i_sb->s_dev;
+		__entry->ino	= page->mapping->host->i_ino;
+		__entry->type	= type;
+		__entry->dir	= S_ISDIR(page->mapping->host->i_mode);
+		__entry->index	= page->index;
+		__entry->dirty	= PageDirty(page);
+	),
+
+	TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, dirty = %d",
+		show_dev_ino(__entry),
+		show_block_type(__entry->type),
+		show_file_type(__entry->dir),
+		(unsigned long)__entry->index,
+		__entry->dirty)
+);
+
+DEFINE_EVENT(f2fs__page, f2fs_set_page_dirty,
+
+	TP_PROTO(struct page *page, int type),
+
+	TP_ARGS(page, type)
+);
+
 TRACE_EVENT(f2fs_submit_write_page,
 
 	TP_PROTO(struct page *page, block_t blk_addr, int type),
-- 
cgit v1.3


From e943a10d94f6076f7bc8da91828cbdca5fbf62fc Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Fri, 25 Oct 2013 14:26:31 +0900
Subject: f2fs: add tracepoint for vm_page_mkwrite

This patch adds a tracepoint for f2fs_vm_page_mkwrite.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/file.c              | 1 +
 include/trace/events/f2fs.h | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c80faa2fff39..2d4190a4abe3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -88,6 +88,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	set_page_dirty(page);
 	SetPageUptodate(page);
 
+	trace_f2fs_vm_page_mkwrite(page, DATA);
 mapped:
 	/* fill the page */
 	wait_on_page_writeback(page);
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index ebde3afb89cd..e0dc355fa317 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -667,6 +667,13 @@ DEFINE_EVENT(f2fs__page, f2fs_set_page_dirty,
 	TP_ARGS(page, type)
 );
 
+DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite,
+
+	TP_PROTO(struct page *page, int type),
+
+	TP_ARGS(page, type)
+);
+
 TRACE_EVENT(f2fs_submit_write_page,
 
 	TP_PROTO(struct page *page, block_t blk_addr, int type),
-- 
cgit v1.3


From 7203db97b7378c2571797c13aa89327a2c487ea1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Fri, 7 Jun 2013 10:17:22 +0800
Subject: UBIFS: fix return code

Fix to return -ENOMEM in the kmalloc() and d_make_root() error handling
case instead of 0, as done elsewhere in those functions.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
---
 fs/ubifs/super.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 879b9976c12b..01e1ad08d08e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1629,8 +1629,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	}
 
 	c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
-	if (!c->write_reserve_buf)
+	if (!c->write_reserve_buf) {
+		err = -ENOMEM;
 		goto out;
+	}
 
 	err = ubifs_lpt_init(c, 0, 1);
 	if (err)
@@ -2063,8 +2065,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	sb->s_root = d_make_root(root);
-	if (!sb->s_root)
+	if (!sb->s_root) {
+		err = -ENOMEM;
 		goto out_umount;
+	}
 
 	mutex_unlock(&c->umount_mutex);
 	return 0;
-- 
cgit v1.3


From 58a4e23703b22c331b01fbd0c12161aadaa6d50b Mon Sep 17 00:00:00 2001
From: Mats Kärrman <Mats.Karrman@tritech.se>
Date: Wed, 21 Aug 2013 13:24:49 +0000
Subject: UBIFS: correct data corruption range

With power-cut emulation, it is possible that sometimes no data at all is
corrupted and that confusing messages are printed due to errors in the
computation of data corruption range.

[1] The start of the range should be [0..len-1], not [0..len].
[2] The end of the range should always be at least 1 greater than the start.

Signed-off-by: Mats Karrman <mats.karrman@tritech.se>
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
---
 fs/ubifs/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 6e025e02ffde..cc1febd8fadf 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2563,9 +2563,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
 	unsigned int from, to, ffs = chance(1, 2);
 	unsigned char *p = (void *)buf;
 
-	from = prandom_u32() % (len + 1);
-	/* Corruption may only span one max. write unit */
-	to = min(len, ALIGN(from, c->max_write_size));
+	from = prandom_u32() % len;
+	/* Corruption span max to end of write unit */
+	to = min(len, ALIGN(from + 1, c->max_write_size));
 
 	ubifs_warn("filled bytes %u-%u with %s", from, to - 1,
 		   ffs ? "0xFFs" : "random data");
-- 
cgit v1.3


From cce6de908ee934fbea12d6cccc55b87dbd4b0c8a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 15 Oct 2013 12:33:22 -0700
Subject: nfsd: fix Kconfig syntax

The description text for CONFIG_NFSD_V4_SECURITY_LABEL has an unpaired
quote sign which breaks syntax highlighting for the nfsd Kconfig file.
Remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index dc8f1ef665ce..f994e750e0d1 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -95,7 +95,7 @@ config NFSD_V4_SECURITY_LABEL
 	Smack policies on NFSv4 files, say N.
 
 	WARNING: there is still a chance of backwards-incompatible protocol changes.
-	For now we recommend "Y" only for developers and testers."
+	For now we recommend "Y" only for developers and testers.
 
 config NFSD_FAULT_INJECTION
 	bool "NFS server manual fault injection"
-- 
cgit v1.3


From e104f1e9dab6726187810f5d9e06cadb946d4a61 Mon Sep 17 00:00:00 2001
From: Huang Shijie <b32955@freescale.com>
Date: Wed, 25 Sep 2013 14:58:20 +0800
Subject: jffs2: do not support the MLC nand

We should not support the MLC nand for jffs2. So if the nand type is
MLC, we quit immediatly.

Signed-off-by: Huang Shijie <b32955@freescale.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 fs/jffs2/fs.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index fe3c0527545f..09b3ed455724 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -515,6 +515,10 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 
 	c = JFFS2_SB_INFO(sb);
 
+	/* Do not support the MLC nand */
+	if (c->mtd->type == MTD_MLCNANDFLASH)
+		return -EINVAL;
+
 #ifndef CONFIG_JFFS2_FS_WRITEBUFFER
 	if (c->mtd->type == MTD_NANDFLASH) {
 		pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n");
-- 
cgit v1.3


From 4625d6aac2d00a18f7bcc15bffe41e9de3a25332 Mon Sep 17 00:00:00 2001
From: Changman Lee <cm224.lee@samsung.com>
Date: Fri, 25 Oct 2013 17:31:57 +0900
Subject: f2fs: remove unnecessary segment bitmap updates

Only one dirty type is set in __locate_dirty_segment and we can know
dirty type of segment. So we don't need to check other dirty types.

Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 31 ++++++++-----------------------
 1 file changed, 8 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 487af619aeda..8f92c18a01fb 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -58,20 +58,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 
 	if (dirty_type == DIRTY) {
 		struct seg_entry *sentry = get_seg_entry(sbi, segno);
-		enum dirty_type t = DIRTY_HOT_DATA;
+		enum dirty_type t = sentry->type;
 
-		dirty_type = sentry->type;
-
-		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
-			dirty_i->nr_dirty[dirty_type]++;
-
-		/* Only one bitmap should be set */
-		for (; t <= DIRTY_COLD_NODE; t++) {
-			if (t == dirty_type)
-				continue;
-			if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
-				dirty_i->nr_dirty[t]--;
-		}
+		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
+			dirty_i->nr_dirty[t]++;
 	}
 }
 
@@ -84,16 +74,11 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 		dirty_i->nr_dirty[dirty_type]--;
 
 	if (dirty_type == DIRTY) {
-		enum dirty_type t = DIRTY_HOT_DATA;
-
-		/* clear its dirty bitmap */
-		for (; t <= DIRTY_COLD_NODE; t++) {
-			if (test_and_clear_bit(segno,
-						dirty_i->dirty_segmap[t])) {
-				dirty_i->nr_dirty[t]--;
-				break;
-			}
-		}
+		struct seg_entry *sentry = get_seg_entry(sbi, segno);
+		enum dirty_type t = sentry->type;
+
+		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+			dirty_i->nr_dirty[t]--;
 
 		if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
 			clear_bit(GET_SECNO(sbi, segno),
-- 
cgit v1.3


From b8b60e1a65893728ca868493cb4c4b64b55e7f9c Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Mon, 28 Oct 2013 13:12:09 +0900
Subject: f2fs: clean up acl flow for better readability

This patch cleans up a couple of acl codes.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/acl.c | 25 +++++++++++++------------
 fs/f2fs/acl.h |  6 +++---
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index b7826ec1b470..f1a697584651 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -262,8 +262,8 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 
 int f2fs_init_acl(struct inode *inode, struct inode *dir)
 {
-	struct posix_acl *acl = NULL;
 	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+	struct posix_acl *acl = NULL;
 	int error = 0;
 
 	if (!S_ISLNK(inode->i_mode)) {
@@ -276,19 +276,19 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir)
 			inode->i_mode &= ~current_umask();
 	}
 
-	if (test_opt(sbi, POSIX_ACL) && acl) {
+	if (!test_opt(sbi, POSIX_ACL) || !acl)
+		goto cleanup;
 
-		if (S_ISDIR(inode->i_mode)) {
-			error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
-			if (error)
-				goto cleanup;
-		}
-		error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
-		if (error < 0)
-			return error;
-		if (error > 0)
-			error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+	if (S_ISDIR(inode->i_mode)) {
+		error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+		if (error)
+			goto cleanup;
 	}
+	error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
+	if (error < 0)
+		return error;
+	if (error > 0)
+		error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
 cleanup:
 	posix_acl_release(acl);
 	return error;
@@ -313,6 +313,7 @@ int f2fs_acl_chmod(struct inode *inode)
 	error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
 	if (error)
 		return error;
+
 	error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
 	posix_acl_release(acl);
 	return error;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 80f430674417..3839048f2e96 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -36,9 +36,9 @@ struct f2fs_acl_header {
 
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
 
-extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
-extern int f2fs_acl_chmod(struct inode *inode);
-extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
+extern struct posix_acl *f2fs_get_acl(struct inode *, int);
+extern int f2fs_acl_chmod(struct inode *);
+extern int f2fs_init_acl(struct inode *, struct inode *);
 #else
 #define f2fs_check_acl	NULL
 #define f2fs_get_acl	NULL
-- 
cgit v1.3


From 2ed2d5b33cb564025b1eb90650d70a0a3592c0e3 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Mon, 28 Oct 2013 13:17:54 +0900
Subject: f2fs: fix a deadlock during init_acl procedure

The deadlock is found through the following scenario.

sys_mkdir()
 -> f2fs_add_link()
  -> __f2fs_add_link()
   -> init_inode_metadata()
     : lock_page(inode);
    -> f2fs_init_acl()
     -> f2fs_set_acl()
      -> f2fs_setxattr(..., NULL)
       : This NULL page incurs a deadlock at update_inode_page().

So, likewise f2fs_init_security(), this patch adds a parameter to transfer the
locked inode page to f2fs_setxattr().

Found by Linux File System Verification project (linuxtesting.org).

Reported-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/acl.c | 15 ++++++++-------
 fs/f2fs/acl.h |  5 +++--
 fs/f2fs/dir.c |  2 +-
 3 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index f1a697584651..d0fc287efeff 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -205,7 +205,8 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+static int f2fs_set_acl(struct inode *inode, int type,
+			struct posix_acl *acl, struct page *ipage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -250,7 +251,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		}
 	}
 
-	error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
+	error = f2fs_setxattr(inode, name_index, "", value, size, ipage);
 
 	kfree(value);
 	if (!error)
@@ -260,7 +261,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return error;
 }
 
-int f2fs_init_acl(struct inode *inode, struct inode *dir)
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	struct posix_acl *acl = NULL;
@@ -280,7 +281,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir)
 		goto cleanup;
 
 	if (S_ISDIR(inode->i_mode)) {
-		error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+		error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl, ipage);
 		if (error)
 			goto cleanup;
 	}
@@ -288,7 +289,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir)
 	if (error < 0)
 		return error;
 	if (error > 0)
-		error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+		error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage);
 cleanup:
 	posix_acl_release(acl);
 	return error;
@@ -314,7 +315,7 @@ int f2fs_acl_chmod(struct inode *inode)
 	if (error)
 		return error;
 
-	error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+	error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, NULL);
 	posix_acl_release(acl);
 	return error;
 }
@@ -389,7 +390,7 @@ static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
 		acl = NULL;
 	}
 
-	error = f2fs_set_acl(inode, type, acl);
+	error = f2fs_set_acl(inode, type, acl, NULL);
 
 release_and_out:
 	posix_acl_release(acl);
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 3839048f2e96..49633131e038 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -38,7 +38,7 @@ struct f2fs_acl_header {
 
 extern struct posix_acl *f2fs_get_acl(struct inode *, int);
 extern int f2fs_acl_chmod(struct inode *);
-extern int f2fs_init_acl(struct inode *, struct inode *);
+extern int f2fs_init_acl(struct inode *, struct inode *, struct page *);
 #else
 #define f2fs_check_acl	NULL
 #define f2fs_get_acl	NULL
@@ -49,7 +49,8 @@ static inline int f2fs_acl_chmod(struct inode *inode)
 	return 0;
 }
 
-static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
+static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
+							struct page *page)
 {
 	return 0;
 }
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 384c6daf9a89..c9d53fc510c5 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -346,7 +346,7 @@ static struct page *init_inode_metadata(struct inode *inode,
 				goto error;
 		}
 
-		err = f2fs_init_acl(inode, dir);
+		err = f2fs_init_acl(inode, dir, page);
 		if (err)
 			goto error;
 
-- 
cgit v1.3


From 7ff8d45c9dccf0744404d6fe44468ede7c1b9533 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 14 Oct 2013 00:44:19 -0500
Subject: Fix corrupt SMB2 ioctl requests

We were off by one calculating the length of ioctls in some cases
because the protocol specification for SMB2 ioctl includes a mininum
one byte payload but not all SMB2 ioctl requests actually have
a data buffer to send. We were also not zeroing out the
return buffer (in case of error this is helpful).

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2pdu.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index edccb5252462..6e1868611233 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1137,6 +1137,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 
 	cifs_dbg(FYI, "SMB2 IOCTL\n");
 
+	*out_data = NULL;
 	/* zero out returned data len, in case of error */
 	if (plen)
 		*plen = 0;
@@ -1182,11 +1183,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		req->Flags = 0;
 
 	iov[0].iov_base = (char *)req;
-	/* 4 for rfc1002 length field */
-	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
-	if (indatalen)
-		inc_rfc1001_len(req, indatalen);
+	/*
+	 * If no input data, the size of ioctl struct in
+	 * protocol spec still includes a 1 byte data buffer,
+	 * but if input data passed to ioctl, we do not
+	 * want to double count this, so we do not send
+	 * the dummy one byte of data in iovec[0] if sending
+	 * input data (in iovec[1]). We also must add 4 bytes
+	 * in first iovec to allow for rfc1002 length field.
+	 */
+
+	if (indatalen) {
+		iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+		inc_rfc1001_len(req, indatalen - 1);
+	} else
+		iov[0].iov_len = get_rfc1002_length(req) + 4;
+
 
 	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
 	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
-- 
cgit v1.3


From 64a5cfa6db94c5abba2cafe77aca077dd1e3283b Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 14 Oct 2013 15:31:32 -0500
Subject: Allow setting per-file compression via SMB2/3

Allow cifs/smb2/smb3 to return whether or not a file is compressed
via lsattr, and allow SMB2/SMB3 to set the per-file compression
flag ("chattr +c filename" on an smb3 mount).

Windows users often set the compressed flag (it can be
done from the desktop and file manager).  David Disseldorp
has patches to Samba server to support this (at least on btrfs)
which are complementary to this

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsglob.h  |  2 ++
 fs/cifs/ioctl.c     | 55 +++++++++++++++++++++++++++++++++++------------------
 fs/cifs/smb2ops.c   | 11 +++++++++++
 fs/cifs/smb2pdu.c   | 27 ++++++++++++++++++++++++++
 fs/cifs/smb2pdu.h   |  9 ++++++++-
 fs/cifs/smb2proto.h |  2 ++
 6 files changed, 87 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 52b6f6c26bfc..06e8947fc370 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -278,6 +278,8 @@ struct smb_version_operations {
 	/* set attributes */
 	int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
 			     const unsigned int);
+	int (*set_compression)(const unsigned int, struct cifs_tcon *,
+			       struct cifsFileInfo *);
 	/* check if we can send an echo or nor */
 	bool (*can_echo)(struct TCP_Server_Info *);
 	/* send echo request */
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 3e0845585853..029867078aff 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -3,7 +3,7 @@
  *
  *   vfs operations that deal with io control
  *
- *   Copyright (C) International Business Machines  Corp., 2005,2007
+ *   Copyright (C) International Business Machines  Corp., 2005,2013
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -34,13 +34,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	int rc = -ENOTTY; /* strange error - but the precedent */
 	unsigned int xid;
 	struct cifs_sb_info *cifs_sb;
-#ifdef CONFIG_CIFS_POSIX
 	struct cifsFileInfo *pSMBFile = filep->private_data;
 	struct cifs_tcon *tcon;
 	__u64	ExtAttrBits = 0;
 	__u64	ExtAttrMask = 0;
 	__u64   caps;
-#endif /* CONFIG_CIFS_POSIX */
 
 	xid = get_xid();
 
@@ -49,12 +47,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	cifs_sb = CIFS_SB(inode->i_sb);
 
 	switch (command) {
-#ifdef CONFIG_CIFS_POSIX
 		case FS_IOC_GETFLAGS:
 			if (pSMBFile == NULL)
 				break;
 			tcon = tlink_tcon(pSMBFile->tlink);
 			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+#ifdef CONFIG_CIFS_POSIX
 			if (CIFS_UNIX_EXTATTR_CAP & caps) {
 				rc = CIFSGetExtAttr(xid, tcon,
 						    pSMBFile->fid.netfid,
@@ -63,29 +61,50 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 					rc = put_user(ExtAttrBits &
 						FS_FL_USER_VISIBLE,
 						(int __user *)arg);
+				if (rc != EOPNOTSUPP)
+					break;
+			}
+#endif /* CONFIG_CIFS_POSIX */
+			rc = 0;
+			if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) {
+				/* add in the compressed bit */
+				ExtAttrBits = FS_COMPR_FL;
+				rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE,
+					      (int __user *)arg);
 			}
 			break;
-
 		case FS_IOC_SETFLAGS:
 			if (pSMBFile == NULL)
 				break;
 			tcon = tlink_tcon(pSMBFile->tlink);
 			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
-			if (CIFS_UNIX_EXTATTR_CAP & caps) {
-				if (get_user(ExtAttrBits, (int __user *)arg)) {
-					rc = -EFAULT;
-					break;
-				}
-				/*
-				 * rc = CIFSGetExtAttr(xid, tcon,
-				 *		       pSMBFile->fid.netfid,
-				 *		       extAttrBits,
-				 *		       &ExtAttrMask);
-				 */
+
+			if (get_user(ExtAttrBits, (int __user *)arg)) {
+				rc = -EFAULT;
+				break;
+			}
+
+			/*
+			 * if (CIFS_UNIX_EXTATTR_CAP & caps)
+			 *	rc = CIFSSetExtAttr(xid, tcon,
+			 *		       pSMBFile->fid.netfid,
+			 *		       extAttrBits,
+			 *		       &ExtAttrMask);
+			 * if (rc != EOPNOTSUPP)
+			 *	break;
+			 */
+
+			/* Currently only flag we can set is compressed flag */
+			if ((ExtAttrBits & FS_COMPR_FL) == 0)
+				break;
+
+			/* Try to set compress flag */
+			if (tcon->ses->server->ops->set_compression) {
+				rc = tcon->ses->server->ops->set_compression(
+							xid, tcon, pSMBFile);
+				cifs_dbg(FYI, "set compress flag rc %d\n", rc);
 			}
-			cifs_dbg(FYI, "set flags not implemented yet\n");
 			break;
-#endif /* CONFIG_CIFS_POSIX */
 		default:
 			cifs_dbg(FYI, "unsupported ioctl\n");
 			break;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 861b33214144..b96bacce955a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -445,6 +445,14 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
 			    cfile->fid.volatile_fid, cfile->pid, &eof);
 }
 
+static int
+smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile)
+{
+	return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid,
+			    cfile->fid.volatile_fid);
+}
+
 static int
 smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 		     const char *path, struct cifs_sb_info *cifs_sb,
@@ -874,6 +882,7 @@ struct smb_version_operations smb20_operations = {
 	.set_path_size = smb2_set_path_size,
 	.set_file_size = smb2_set_file_size,
 	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
 	.mkdir = smb2_mkdir,
 	.mkdir_setinfo = smb2_mkdir_setinfo,
 	.rmdir = smb2_rmdir,
@@ -945,6 +954,7 @@ struct smb_version_operations smb21_operations = {
 	.set_path_size = smb2_set_path_size,
 	.set_file_size = smb2_set_file_size,
 	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
 	.mkdir = smb2_mkdir,
 	.mkdir_setinfo = smb2_mkdir_setinfo,
 	.rmdir = smb2_rmdir,
@@ -1017,6 +1027,7 @@ struct smb_version_operations smb30_operations = {
 	.set_path_size = smb2_set_path_size,
 	.set_file_size = smb2_set_file_size,
 	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
 	.mkdir = smb2_mkdir,
 	.mkdir_setinfo = smb2_mkdir_setinfo,
 	.rmdir = smb2_rmdir,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 6e1868611233..bbafa12e83b2 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1247,6 +1247,33 @@ ioctl_exit:
 	return rc;
 }
 
+/*
+ *   Individual callers to ioctl worker function follow
+ */
+
+int
+SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid)
+{
+	int rc;
+	char *res_key = NULL;
+	struct  compress_ioctl fsctl_input;
+	char *ret_data = NULL;
+
+	fsctl_input.CompressionState =
+			__constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+			FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+			(char *)&fsctl_input /* data input */,
+			2 /* in data len */, &ret_data /* out data */, NULL);
+
+	cifs_dbg(FYI, "set compression rc %d\n", rc);
+	kfree(res_key);
+
+	return rc;
+}
+
 int
 SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	   u64 persistent_fid, u64 volatile_fid)
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index b83d0118a757..c7c3c8294d1a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -569,6 +569,13 @@ struct network_interface_info_ioctl_rsp {
 
 #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
 
+struct compress_ioctl {
+	__le16 CompressionState;
+} __packed;
+
+#define COMPRESSION_FORMAT_NONE		0x0000
+#define COMPRESSION_FORMAT_DEFAULT	0x0001
+#define COMPRESSION_FORMAT_LZNT1	0x0002
 struct smb2_ioctl_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;	/* Must be 57 */
@@ -584,7 +591,7 @@ struct smb2_ioctl_req {
 	__le32 MaxOutputResponse;
 	__le32 Flags;
 	__u32  Reserved2;
-	char   Buffer[0];
+	__u8   Buffer[0];
 } __packed;
 
 struct smb2_ioctl_rsp {
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index e3fb4801ee96..3cf22e39420d 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -142,6 +142,8 @@ extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
 extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 			 u64 persistent_fid, u64 volatile_fid,
 			 FILE_BASIC_INFO *buf);
+extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+				u64 persistent_fid, u64 volatile_fid);
 extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 			     const u64 persistent_fid, const u64 volatile_fid,
 			     const __u8 oplock_level);
-- 
cgit v1.3


From 34f626406c09dd45878ce75170abab342985ec24 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Wed, 9 Oct 2013 02:07:00 -0500
Subject: Query file system attributes from server on SMB2, not just cifs,
 mounts

Currently SMB2 and SMB3 mounts do not query the file system attributes
from the server at mount time as is done for cifs.  These can be useful for debugging.

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifspdu.h   |  3 +++
 fs/cifs/smb2ops.c   | 28 ++++++++++++++++++++++++++++
 fs/cifs/smb2pdu.c   | 42 ++++++++++++++++++++++++++++++++++++++++--
 fs/cifs/smb2proto.h |  2 ++
 4 files changed, 73 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 08f9dfb1a894..d40bd77a2392 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2215,6 +2215,9 @@ typedef struct {
 	__le32 DeviceCharacteristics;
 } __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */
 
+/* minimum includes first three fields, and empty FS Name */
+#define MIN_FS_ATTR_INFO_SIZE 12
+
 typedef struct {
 	__le32 Attributes;
 	__le32 MaxPathNameComponentLength;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index b96bacce955a..a53e2053ae9d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -209,6 +209,31 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	return rsize;
 }
 
+static void
+smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	__le16 srch_path = 0; /* Null - open root of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (rc)
+		return;
+
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	return;
+}
+
 static int
 smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
 			struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -873,6 +898,7 @@ struct smb_version_operations smb20_operations = {
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb2_qfs_tcon,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
@@ -945,6 +971,7 @@ struct smb_version_operations smb21_operations = {
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb2_qfs_tcon,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
@@ -1018,6 +1045,7 @@ struct smb_version_operations smb30_operations = {
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb2_qfs_tcon,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index bbafa12e83b2..df12cf8bd979 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2339,7 +2339,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
-		goto qinf_exit;
+		goto qfsinf_exit;
 	}
 	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
 
@@ -2351,7 +2351,45 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	if (!rc)
 		copy_fs_info_to_kstatfs(info, fsdata);
 
-qinf_exit:
+qfsinf_exit:
+	free_rsp_buf(resp_buftype, iov.iov_base);
+	return rc;
+}
+
+int
+SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+	      u64 persistent_fid, u64 volatile_fid)
+{
+	struct smb2_query_info_rsp *rsp = NULL;
+	struct kvec iov;
+	int rc = 0;
+	int resp_buftype;
+	struct cifs_ses *ses = tcon->ses;
+	unsigned int rsp_len, offset;
+
+	rc = build_qfs_info_req(&iov, tcon, SMB_QUERY_FS_ATTRIBUTE_INFO,
+				sizeof(FILE_SYSTEM_ATTRIBUTE_INFO),
+				persistent_fid, volatile_fid);
+	if (rc)
+		return rc;
+
+	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+	if (rc) {
+		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+		goto qfsattr_exit;
+	}
+	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+
+	rsp_len = le32_to_cpu(rsp->OutputBufferLength);
+	offset = le16_to_cpu(rsp->OutputBufferOffset);
+	rc = validate_buf(offset, rsp_len, &rsp->hdr, MIN_FS_ATTR_INFO_SIZE);
+	if (!rc) {
+		memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset
+			+ (char *)&rsp->hdr, min_t(unsigned int,
+			rsp_len, sizeof(FILE_SYSTEM_ATTRIBUTE_INFO)));
+	}
+
+qfsattr_exit:
 	free_rsp_buf(resp_buftype, iov.iov_base);
 	return rc;
 }
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 3cf22e39420d..68dc00d5fb12 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -150,6 +150,8 @@ extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 			 u64 persistent_file_id, u64 volatile_file_id,
 			 struct kstatfs *FSData);
+extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+			 u64 persistent_file_id, u64 volatile_file_id);
 extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
 		     const __u64 persist_fid, const __u64 volatile_fid,
 		     const __u32 pid, const __u64 length, const __u64 offset,
-- 
cgit v1.3


From 944d6f1a5b8f42a780a65378e5f52bea3ae0ce07 Mon Sep 17 00:00:00 2001
From: Tim Gardner <timg@tpi.com>
Date: Wed, 16 Oct 2013 09:09:49 -0600
Subject: cifs: Remove redundant multiplex identifier check from
 check_smb_hdr()

The only call site for check_smb_header() assigns 'mid' from the SMB
packet, which is then checked again in check_smb_header(). This seems
like redundant redundancy.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Tim Gardner <timg@tpi.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/misc.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 138a011633fe..298e31e3bdc6 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -278,7 +278,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 }
 
 static int
-check_smb_hdr(struct smb_hdr *smb, __u16 mid)
+check_smb_hdr(struct smb_hdr *smb)
 {
 	/* does it have the right SMB "signature" ? */
 	if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
@@ -287,13 +287,6 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 		return 1;
 	}
 
-	/* Make sure that message ids match */
-	if (mid != smb->Mid) {
-		cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n",
-			 smb->Mid, mid);
-		return 1;
-	}
-
 	/* if it's a response then accept */
 	if (smb->Flags & SMBFLG_RESPONSE)
 		return 0;
@@ -310,7 +303,6 @@ int
 checkSMB(char *buf, unsigned int total_read)
 {
 	struct smb_hdr *smb = (struct smb_hdr *)buf;
-	__u16 mid = smb->Mid;
 	__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
 	__u32 clc_len;  /* calculated length */
 	cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
@@ -348,7 +340,7 @@ checkSMB(char *buf, unsigned int total_read)
 	}
 
 	/* otherwise, there is enough to get to the BCC */
-	if (check_smb_hdr(smb, mid))
+	if (check_smb_hdr(smb))
 		return -EIO;
 	clc_len = smbCalcSize(smb);
 
-- 
cgit v1.3


From 3660cd4322fce986689b06225d9c12d77193c252 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 3 Oct 2013 12:23:24 -0400
Subject: NFSv4 Remove zeroing state kern warnings

As of commit 5d422301f97b821301efcdb6fc9d1a83a5c102d6 we no longer zero the
state.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cc14cbb78b73..fa2706a4225a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1375,8 +1375,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 				goto out;
 			default:
-				printk(KERN_ERR "NFS: %s: unhandled error %d. "
-					"Zeroing state\n", __func__, status);
+				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+					 __func__, status);
 			case -ENOMEM:
 			case -NFS4ERR_DENIED:
 			case -NFS4ERR_RECLAIM_BAD:
@@ -1439,15 +1439,12 @@ restart:
 		}
 		switch (status) {
 			default:
-				printk(KERN_ERR "NFS: %s: unhandled error %d. "
-					"Zeroing state\n", __func__, status);
+				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+					__func__, status);
 			case -ENOENT:
 			case -ENOMEM:
 			case -ESTALE:
-				/*
-				 * Open state on this file cannot be recovered
-				 * All we can do is revert to using the zero stateid.
-				 */
+				/* Open state on this file cannot be recovered */
 				nfs4_state_mark_recovery_failed(state, status);
 				break;
 			case -EAGAIN:
-- 
cgit v1.3


From 57acc40d73407159727b3a1456f0a498133831ba Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 18 Oct 2013 10:03:37 -0400
Subject: nfs: reject version and minorversion changes on remount attempts

Reported-by: Eric Doutreleau <edoutreleau@genoscope.cns.fr>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a03b9c6f9489..137572bb7037 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2143,6 +2143,8 @@ nfs_compare_remount_data(struct nfs_server *nfss,
 	if (data->flags != nfss->flags ||
 	    data->rsize != nfss->rsize ||
 	    data->wsize != nfss->wsize ||
+	    data->version != nfss->nfs_client->rpc_ops->version ||
+	    data->minorversion != nfss->nfs_client->cl_minorversion ||
 	    data->retrans != nfss->client->cl_timeout->to_retries ||
 	    data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
 	    data->acregmin != nfss->acregmin / HZ ||
@@ -2197,6 +2199,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
 	data->nfs_server.port = nfss->port;
 	data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+	data->version = nfsvers;
+	data->minorversion = nfss->nfs_client->cl_minorversion;
 	memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
 		data->nfs_server.addrlen);
 
-- 
cgit v1.3


From 1966903f8e28b31ff82de2e2180f0c066399288d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 21 Oct 2013 09:52:19 -0400
Subject: nfs: fix handling of invalid mount options in nfs_remount

nfs_parse_mount_options returns 0 on error, not -errno.

Reported-by: Karel Zak <kzak@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 137572bb7037..3f5a7a85c9c2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2205,8 +2205,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 		data->nfs_server.addrlen);
 
 	/* overwrite those values with any that were specified */
-	error = nfs_parse_mount_options((char *)options, data);
-	if (error < 0)
+	error = -EINVAL;
+	if (!nfs_parse_mount_options((char *)options, data))
 		goto out;
 
 	/*
-- 
cgit v1.3


From 83c78eb0425f587a34a6644da79bdcde987dc974 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Oct 2013 14:50:38 -0400
Subject: NFSv4.1: Don't change the security label as part of open reclaim.

The current caching model calls for the security label to be set on
first lookup and/or on any subsequent label changes. There is no
need to do it as part of an open reclaim.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2b4845ecb4d..8dd61eb86f6f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1337,8 +1337,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	if (ret)
 		goto err;
 
-	nfs_setsecurity(inode, &data->f_attr, data->f_label);
-
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
-- 
cgit v1.3


From f494a6071d31e3294a3b51ad7a3684f983953f9f Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Mon, 21 Oct 2013 13:10:10 -0400
Subject: NFSv4: fix NULL dereference in open recover

_nfs4_opendata_reclaim_to_nfs4_state doesn't expect to see a cached
open CLAIM_PREVIOUS, but this can happen. An example is when there are
RDWR openers and RDONLY openers on a delegation stateid. The recovery
path will first try an open CLAIM_PREVIOUS for the RDWR openers, this
marks the delegation as not needing RECLAIM anymore, so the open
CLAIM_PREVIOUS for the RDONLY openers will not actually send an rpc.

The NULL dereference is due to _nfs4_opendata_reclaim_to_nfs4_state
returning PTR_ERR(rpc_status) when !rpc_done. When the open is
cached, rpc_done == 0 and rpc_status == 0, thus
_nfs4_opendata_reclaim_to_nfs4_state returns NULL - this is unexpected
by callers of nfs4_opendata_to_nfs4_state().

This can be reproduced easily by opening the same file two times on an
NFSv4.0 mount with delegations enabled, once as RDWR and once as RDONLY then
sleeping for a long time.  While the files are held open, kick off state
recovery and this NULL dereference will be hit every time.

An example OOPS:

[   65.003602] BUG: unable to handle kernel NULL pointer dereference at 00000000
00000030
[   65.005312] IP: [<ffffffffa037d6ee>] __nfs4_close+0x1e/0x160 [nfsv4]
[   65.006820] PGD 7b0ea067 PUD 791ff067 PMD 0
[   65.008075] Oops: 0000 [#1] SMP
[   65.008802] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache
snd_ens1371 gameport nfsd snd_rawmidi snd_ac97_codec ac97_bus btusb snd_seq snd
_seq_device snd_pcm ppdev bluetooth auth_rpcgss coretemp snd_page_alloc crc32_pc
lmul crc32c_intel ghash_clmulni_intel microcode rfkill nfs_acl vmw_balloon serio
_raw snd_timer lockd parport_pc e1000 snd soundcore parport i2c_piix4 shpchp vmw
_vmci sunrpc ata_generic mperf pata_acpi mptspi vmwgfx ttm scsi_transport_spi dr
m mptscsih mptbase i2c_core
[   65.018684] CPU: 0 PID: 473 Comm: 192.168.10.85-m Not tainted 3.11.2-201.fc19
.x86_64 #1
[   65.020113] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop
Reference Platform, BIOS 6.00 07/31/2013
[   65.022012] task: ffff88003707e320 ti: ffff88007b906000 task.ti: ffff88007b906000
[   65.023414] RIP: 0010:[<ffffffffa037d6ee>]  [<ffffffffa037d6ee>] __nfs4_close+0x1e/0x160 [nfsv4]
[   65.025079] RSP: 0018:ffff88007b907d10  EFLAGS: 00010246
[   65.026042] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[   65.027321] RDX: 0000000000000050 RSI: 0000000000000001 RDI: 0000000000000000
[   65.028691] RBP: ffff88007b907d38 R08: 0000000000016f60 R09: 0000000000000000
[   65.029990] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
[   65.031295] R13: 0000000000000050 R14: 0000000000000000 R15: 0000000000000001
[   65.032527] FS:  0000000000000000(0000) GS:ffff88007f600000(0000) knlGS:0000000000000000
[   65.033981] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   65.035177] CR2: 0000000000000030 CR3: 000000007b27f000 CR4: 00000000000407f0
[   65.036568] Stack:
[   65.037011]  0000000000000000 0000000000000001 ffff88007b907d90 ffff88007a880220
[   65.038472]  ffff88007b768de8 ffff88007b907d48 ffffffffa037e4a5 ffff88007b907d80
[   65.039935]  ffffffffa036a6c8 ffff880037020e40 ffff88007a880000 ffff880037020e40
[   65.041468] Call Trace:
[   65.042050]  [<ffffffffa037e4a5>] nfs4_close_state+0x15/0x20 [nfsv4]
[   65.043209]  [<ffffffffa036a6c8>] nfs4_open_recover_helper+0x148/0x1f0 [nfsv4]
[   65.044529]  [<ffffffffa036a886>] nfs4_open_recover+0x116/0x150 [nfsv4]
[   65.045730]  [<ffffffffa036d98d>] nfs4_open_reclaim+0xad/0x150 [nfsv4]
[   65.046905]  [<ffffffffa037d979>] nfs4_do_reclaim+0x149/0x5f0 [nfsv4]
[   65.048071]  [<ffffffffa037e1dc>] nfs4_run_state_manager+0x3bc/0x670 [nfsv4]
[   65.049436]  [<ffffffffa037de20>] ? nfs4_do_reclaim+0x5f0/0x5f0 [nfsv4]
[   65.050686]  [<ffffffffa037de20>] ? nfs4_do_reclaim+0x5f0/0x5f0 [nfsv4]
[   65.051943]  [<ffffffff81088640>] kthread+0xc0/0xd0
[   65.052831]  [<ffffffff81088580>] ? insert_kthread_work+0x40/0x40
[   65.054697]  [<ffffffff8165686c>] ret_from_fork+0x7c/0xb0
[   65.056396]  [<ffffffff81088580>] ? insert_kthread_work+0x40/0x40
[   65.058208] Code: 5c 41 5d 5d c3 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 89 d5 41 54 53 48 89 fb <4c> 8b 67 30 f0 41 ff 44 24 44 49 8d 7c 24 40 e8 0e 0a 2d e1 44
[   65.065225] RIP  [<ffffffffa037d6ee>] __nfs4_close+0x1e/0x160 [nfsv4]
[   65.067175]  RSP <ffff88007b907d10>
[   65.068570] CR2: 0000000000000030
[   65.070098] ---[ end trace 0d1fe4f5c7dd6f8b ]---

Cc: <stable@vger.kernel.org> #3.7+
Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dd61eb86f6f..acd20c549ce2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1317,7 +1317,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	struct nfs4_state *state = data->state;
 	int ret;
 
-	if (!data->rpc_done) {
+	/* allow cached opens (!rpc_done && !rpc_status) */
+	if (!data->rpc_done && data->rpc_status) {
 		ret = data->rpc_status;
 		goto err;
 	}
-- 
cgit v1.3


From a43ec98b72aae3e330f0673438f58316c3769b84 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Mon, 21 Oct 2013 13:10:11 -0400
Subject: NFSv4: don't fail on missing fattr in open recover

This is an unneeded check that could cause the client to fail to recover
opens.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index acd20c549ce2..e76fd218ec8e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1323,12 +1323,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 		goto err;
 	}
 
-	ret = -ESTALE;
-	if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) ||
-	    !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) ||
-	    !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE))
-		goto err;
-
 	ret = -ENOMEM;
 	state = nfs4_get_open_state(inode, data->owner);
 	if (state == NULL)
-- 
cgit v1.3


From d49f042aeec99c5f87160bb52dd52088b1051311 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Oct 2013 14:57:12 -0400
Subject: NFSv4: Fix state reference counting in
 _nfs4_opendata_reclaim_to_nfs4_state

Currently, if the call to nfs_refresh_inode fails, then we end up leaking
a reference count, due to the call to nfs4_get_open_state.
While we're at it, replace nfs4_get_open_state with a simple call to
atomic_inc(); there is no need to do a full lookup of the struct nfs_state
since it is passed as an argument in the struct nfs4_opendata, and
is already assigned to the variable 'state'.

Cc: stable@vger.kernel.org # 3.7.x: a43ec98b72a: NFSv4: don't fail on missing
Cc: stable@vger.kernel.org # 3.7.x
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e76fd218ec8e..d3c4255be6e0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1323,11 +1323,6 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 		goto err;
 	}
 
-	ret = -ENOMEM;
-	state = nfs4_get_open_state(inode, data->owner);
-	if (state == NULL)
-		goto err;
-
 	ret = nfs_refresh_inode(inode, &data->f_attr);
 	if (ret)
 		goto err;
@@ -1336,6 +1331,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			    data->o_arg.fmode);
+	atomic_inc(&state->count);
 
 	return state;
 err:
-- 
cgit v1.3


From d2bfda2e7aa036f90ccea610a657064b1e267913 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Mon, 21 Oct 2013 13:10:13 -0400
Subject: NFSv4: don't reprocess cached open CLAIM_PREVIOUS

Cached opens have already been handled by _nfs4_opendata_reclaim_to_nfs4_state
and can safely skip being reprocessed, but must still call update_open_stateid
to make sure that all active fmodes are recovered.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Cc: stable@vger.kernel.org # 3.7.x: f494a6071d3: NFSv4: fix NULL dereference
Cc: stable@vger.kernel.org # 3.7.x: a43ec98b72a: NFSv4: don't fail on missin
Cc: stable@vger.kernel.org # 3.7.x
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d3c4255be6e0..259f8beb96a3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1317,10 +1317,13 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	struct nfs4_state *state = data->state;
 	int ret;
 
-	/* allow cached opens (!rpc_done && !rpc_status) */
-	if (!data->rpc_done && data->rpc_status) {
-		ret = data->rpc_status;
-		goto err;
+	if (!data->rpc_done) {
+		if (data->rpc_status) {
+			ret = data->rpc_status;
+			goto err;
+		}
+		/* cached opens have already been processed */
+		goto update;
 	}
 
 	ret = nfs_refresh_inode(inode, &data->f_attr);
@@ -1329,6 +1332,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
+update:
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			    data->o_arg.fmode);
 	atomic_inc(&state->count);
-- 
cgit v1.3


From 32e62b7c3ef095eccbb6a8c96fddf05dacc749df Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:28 -0400
Subject: NFS: Add nfs4_update_server

New function nfs4_update_server() moves an nfs_server to a different
nfs_client.  This is done as part of migration recovery.

Though it may be appealing to think of them as the same thing,
migration recovery is not the same as following a referral.

For a referral, the client has not descended into the file system
yet: it has no nfs_server, no super block, no inodes or open state.
It is enough to simply instantiate the nfs_server and super block,
and perform a referral mount.

For a migration, however, we have all of those things already, and
they have to be moved to a different nfs_client.  No local namespace
changes are needed here.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c     |   3 +-
 fs/nfs/internal.h   |   3 ++
 fs/nfs/nfs4client.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index af0325864df6..692fd0e9362f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -945,7 +945,7 @@ void nfs_server_insert_lists(struct nfs_server *server)
 }
 EXPORT_SYMBOL_GPL(nfs_server_insert_lists);
 
-static void nfs_server_remove_lists(struct nfs_server *server)
+void nfs_server_remove_lists(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs_net *nn;
@@ -962,6 +962,7 @@ static void nfs_server_remove_lists(struct nfs_server *server)
 
 	synchronize_rcu();
 }
+EXPORT_SYMBOL_GPL(nfs_server_remove_lists);
 
 /*
  * Allocate and initialise a server record
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 38da8c2b81ac..e5a6bd12562a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
 				  rpc_authflavor_t);
 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
 void nfs_server_insert_lists(struct nfs_server *);
+void nfs_server_remove_lists(struct nfs_server *);
 void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
 int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
 		rpc_authflavor_t);
@@ -174,6 +175,8 @@ extern struct nfs_server *nfs4_create_server(
 					struct nfs_subversion *);
 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
 						      struct nfs_fh *);
+extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
+					struct sockaddr *sap, size_t salen);
 extern void nfs_free_server(struct nfs_server *server);
 extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fh *,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 511cdce6ecf2..c6eee812337f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1092,3 +1092,111 @@ error:
 	dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
 	return ERR_PTR(error);
 }
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+static int nfs_probe_destination(struct nfs_server *server)
+{
+	struct inode *inode = server->super->s_root->d_inode;
+	struct nfs_fattr *fattr;
+	int error;
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return -ENOMEM;
+
+	/* Sanity: the probe won't work if the destination server
+	 * does not recognize the migrated FH. */
+	error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr);
+
+	nfs_free_fattr(fattr);
+	return error;
+}
+
+/**
+ * nfs4_update_server - Move an nfs_server to a different nfs_client
+ *
+ * @server: represents FSID to be moved
+ * @hostname: new end-point's hostname
+ * @sap: new end-point's socket address
+ * @salen: size of "sap"
+ *
+ * The nfs_server must be quiescent before this function is invoked.
+ * Either its session is drained (NFSv4.1+), or its transport is
+ * plugged and drained (NFSv4.0).
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_update_server(struct nfs_server *server, const char *hostname,
+		       struct sockaddr *sap, size_t salen)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct xprt_create xargs = {
+		.ident		= clp->cl_proto,
+		.net		= &init_net,
+		.dstaddr	= sap,
+		.addrlen	= salen,
+		.servername	= hostname,
+	};
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct sockaddr_storage address;
+	struct sockaddr *localaddr = (struct sockaddr *)&address;
+	int error;
+
+	dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			hostname);
+
+	error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_switch_client_transport returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = rpc_localaddr(clnt, localaddr, sizeof(address));
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_localaddr returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = -EAFNOSUPPORT;
+	if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) {
+		dprintk("<-- %s(): rpc_ntop returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	nfs_server_remove_lists(server);
+	error = nfs4_set_client(server, hostname, sap, salen, buf,
+				clp->cl_rpcclient->cl_auth->au_flavor,
+				clp->cl_proto, clnt->cl_timeout,
+				clp->cl_minorversion, clp->cl_net);
+	nfs_put_client(clp);
+	if (error != 0) {
+		nfs_server_insert_lists(server);
+		dprintk("<-- %s(): nfs4_set_client returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	if (server->nfs_client->cl_hostname == NULL)
+		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+	nfs_server_insert_lists(server);
+
+	error = nfs_probe_destination(server);
+	if (error < 0)
+		goto out;
+
+	dprintk("<-- %s() succeeded\n", __func__);
+
+out:
+	return error;
+}
-- 
cgit v1.3


From 800c06a5bf497e0587ba8382a761f31a7825e2cd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:34 -0400
Subject: NFS: Add functions to swap transports during migration recovery

Introduce functions that can walk through an array of returned
fs_locations information and connect a transport to one of the
destination servers listed therein.

Note that NFS minor version 1 introduces "fs_locations_info" which
extends the locations array sorting criteria available to clients.
This is not supported yet.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h       |   2 +
 fs/nfs/nfs4namespace.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 28842abafab4..fcae7289d557 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,8 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
 struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 			       struct nfs_fh *, struct nfs_fattr *);
+int nfs4_replace_transport(struct nfs_server *server,
+				const struct nfs4_fs_locations *locations);
 
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2288cd3c9278..ebd8b064843b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -400,3 +400,104 @@ out:
 	rpc_shutdown_client(client);
 	return mnt;
 }
+
+/*
+ * Try one location from the fs_locations array.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+static int nfs4_try_replacing_one_location(struct nfs_server *server,
+		char *page, char *page2,
+		const struct nfs4_fs_location *location)
+{
+	const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+	struct sockaddr *sap;
+	unsigned int s;
+	size_t salen;
+	int error;
+
+	sap = kmalloc(addr_bufsize, GFP_KERNEL);
+	if (sap == NULL)
+		return -ENOMEM;
+
+	error = -ENOENT;
+	for (s = 0; s < location->nservers; s++) {
+		const struct nfs4_string *buf = &location->servers[s];
+		char *hostname;
+
+		if (buf->len <= 0 || buf->len > PAGE_SIZE)
+			continue;
+
+		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL)
+			continue;
+
+		salen = nfs_parse_server_name(buf->data, buf->len,
+						sap, addr_bufsize, server);
+		if (salen == 0)
+			continue;
+		rpc_set_port(sap, NFS_PORT);
+
+		error = -ENOMEM;
+		hostname = kstrndup(buf->data, buf->len, GFP_KERNEL);
+		if (hostname == NULL)
+			break;
+
+		error = nfs4_update_server(server, hostname, sap, salen);
+		kfree(hostname);
+		if (error == 0)
+			break;
+	}
+
+	kfree(sap);
+	return error;
+}
+
+/**
+ * nfs4_replace_transport - set up transport to destination server
+ *
+ * @server: export being migrated
+ * @locations: fs_locations array
+ *
+ * Returns zero on success, or a negative errno value.
+ *
+ * The client tries all the entries in the "locations" array, in the
+ * order returned by the server, until one works or the end of the
+ * array is reached.
+ */
+int nfs4_replace_transport(struct nfs_server *server,
+			   const struct nfs4_fs_locations *locations)
+{
+	char *page = NULL, *page2 = NULL;
+	int loc, error;
+
+	error = -ENOENT;
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	error = -ENOMEM;
+	page = (char *) __get_free_page(GFP_USER);
+	if (!page)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (!page2)
+		goto out;
+
+	for (loc = 0; loc < locations->nlocations; loc++) {
+		const struct nfs4_fs_location *location =
+						&locations->locations[loc];
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0)
+			continue;
+
+		error = nfs4_try_replacing_one_location(server, page,
+							page2, location);
+		if (error == 0)
+			break;
+	}
+
+out:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+	return error;
+}
-- 
cgit v1.3


From ec011fe847347b40c60fdb5085f65227762e2e08 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:39 -0400
Subject: NFS: Introduce a vector of migration recovery ops

The differences between minor version 0 and minor version 1
migration will be abstracted by the addition of a set of migration
recovery ops.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h  |  4 ++++
 fs/nfs/nfs4proc.c | 10 ++++++++++
 2 files changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index fcae7289d557..4f48000b2b97 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -50,6 +50,7 @@ struct nfs4_minor_version_ops {
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
+	const struct nfs4_mig_recovery_ops *mig_recovery_ops;
 };
 
 #define NFS_SEQID_CONFIRMED 1
@@ -203,6 +204,9 @@ struct nfs4_state_maintenance_ops {
 	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
 };
 
+struct nfs4_mig_recovery_ops {
+};
+
 extern const struct dentry_operations nfs4_dentry_operations;
 
 /* dir.c */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 259f8beb96a3..2aacd1357dc5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7881,6 +7881,14 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
 
+static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+};
+#endif	/* CONFIG_NFS_V4_1 */
+
 static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.init_caps = NFS_CAP_READDIRPLUS
@@ -7896,6 +7904,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
 	.state_renewal_ops = &nfs40_state_renewal_ops,
+	.mig_recovery_ops = &nfs40_mig_recovery_ops,
 };
 
 #if defined(CONFIG_NFS_V4_1)
@@ -7916,6 +7925,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
+	.mig_recovery_ops = &nfs41_mig_recovery_ops,
 };
 #endif
 
-- 
cgit v1.3


From 9e6ee76dfb7cd747fac5679542a9b45930173eec Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:45 -0400
Subject: NFS: Export _nfs_display_fhandle()

Allow code in nfsv4.ko to use _nfs_display_fhandle().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c      | 2 ++
 fs/nfs/nfs4client.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eda8879171c4..4bc7538f951e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1209,6 +1209,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
 	 * not on the result */
 	return nfs_fhandle_hash(fh);
 }
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);
 
 /*
  * _nfs_display_fhandle - display an NFS file handle on the console
@@ -1253,6 +1254,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
 #endif
 
 /**
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index c6eee812337f..a2cba66f8345 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -925,7 +925,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 	dprintk("Server FSID: %llx:%llx\n",
 			(unsigned long long) server->fsid.major,
 			(unsigned long long) server->fsid.minor);
-	dprintk("Mount FH: %d\n", mntfh->size);
+	nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
 
 	nfs4_session_set_rwsize(server);
 
-- 
cgit v1.3


From b03d735b4ca2375d2251195cd848713bc55e7d79 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:50 -0400
Subject: NFS: Add method to retrieve fs_locations during migration recovery

The nfs4_proc_fs_locations() function is invoked during referral
processing to perform a GETATTR(fs_locations) on an object's parent
directory in order to discover the target of the referral.  It
performs a LOOKUP in the compound, so the client needs to know the
parent's file handle a priori.

Unfortunately this function is not adequate for handling migration
recovery.  We need to probe fs_locations information on an FSID, but
there's no parent directory available for many operations that
can return NFS4ERR_MOVED.

Another subtlety: recovering from NFS4ERR_LEASE_MOVED is a process
of walking over a list of known FSIDs that reside on the server, and
probing whether they have migrated.  Once the server has detected
that the client has probed all migrated file systems, it stops
returning NFS4ERR_LEASE_MOVED.

A minor version zero server needs to know what client ID is
requesting fs_locations information so it can clear the flag that
forces it to continue returning NFS4ERR_LEASE_MOVED.  This flag is
set per client ID and per FSID.  However, the client ID is not an
argument of either the PUTFH or GETATTR operations.  Later minor
versions have client ID information embedded in the compound's
SEQUENCE operation.

Therefore, by convention, minor version zero clients send a RENEW
operation in the same compound as the GETATTR(fs_locations), since
RENEW's one argument is a clientid4.  This allows a minor version
zero server to identify correctly the client that is probing for a
migration.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        |   4 ++
 fs/nfs/nfs4proc.c       | 153 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4xdr.c        |  46 +++++++++++----
 include/linux/nfs_xdr.h |   4 ++
 4 files changed, 196 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4f48000b2b97..e59d3b4c7944 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -205,6 +205,8 @@ struct nfs4_state_maintenance_ops {
 };
 
 struct nfs4_mig_recovery_ops {
+	int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
+		struct page *, struct rpc_cred *);
 };
 
 extern const struct dentry_operations nfs4_dentry_operations;
@@ -237,6 +239,8 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
 				  struct nfs4_fs_locations *, struct page *);
+extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
+		struct page *page, struct rpc_cred *);
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2aacd1357dc5..c71c16e0ac22 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5984,6 +5984,157 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
 	return err;
 }
 
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop returning
+ * NFS4ERR_LEASE_MOVED to this client.  A RENEW operation is
+ * appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.clientid	= server->nfs_client->cl_clientid,
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status)
+		return status;
+
+	renew_lease(server, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID
+ * performing this operation is identified in the SEQUENCE
+ * operation in this compound.
+ *
+ * When the client supports GETATTR(fs_locations_info), it can
+ * be plumbed in here.
+ */
+static int _nfs41_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_get_locations - discover locations for a migrated FSID
+ * @inode: inode on FSID that is migrating
+ * @locations: result of query
+ * @page: buffer
+ * @cred: credential to use for this operation
+ *
+ * Returns NFS4_OK on success, a negative NFS4ERR status code if the
+ * operation failed, or a negative errno if a local error occurred.
+ *
+ * On success, "locations" is filled in, but if the server has
+ * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not
+ * asserted.
+ *
+ * -NFS4ERR_LEASE_MOVED is returned if the server still has leases
+ * from this client that require migration recovery.
+ */
+int nfs4_proc_get_locations(struct inode *inode,
+			    struct nfs4_fs_locations *locations,
+			    struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->get_locations(inode, locations, page, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
 /**
  * If 'use_integrity' is true and the state managment nfs_client
  * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
@@ -7882,10 +8033,12 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 #endif
 
 static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+	.get_locations = _nfs40_proc_get_locations,
 };
 
 #if defined(CONFIG_NFS_V4_1)
 static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+	.get_locations = _nfs41_proc_get_locations,
 };
 #endif	/* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 79210d23f607..1854b04f828f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -736,13 +736,15 @@ static int nfs4_stat_to_errno(int);
 				 encode_sequence_maxsz + \
 				 encode_putfh_maxsz + \
 				 encode_lookup_maxsz + \
-				 encode_fs_locations_maxsz)
+				 encode_fs_locations_maxsz + \
+				 encode_renew_maxsz)
 #define NFS4_dec_fs_locations_sz \
 				(compound_decode_hdr_maxsz + \
 				 decode_sequence_maxsz + \
 				 decode_putfh_maxsz + \
 				 decode_lookup_maxsz + \
-				 decode_fs_locations_maxsz)
+				 decode_fs_locations_maxsz + \
+				 decode_renew_maxsz)
 #define NFS4_enc_secinfo_sz 	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -2687,11 +2689,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
-	encode_putfh(xdr, args->dir_fh, &hdr);
-	encode_lookup(xdr, args->name, &hdr);
-	replen = hdr.replen;	/* get the attribute into args->page */
-	encode_fs_locations(xdr, args->bitmask, &hdr);
+	if (args->migration) {
+		encode_putfh(xdr, args->fh, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+		if (args->renew)
+			encode_renew(xdr, args->clientid, &hdr);
+	} else {
+		encode_putfh(xdr, args->dir_fh, &hdr);
+		encode_lookup(xdr, args->name, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+	}
 
+	/* Set up reply kvec to capture returned fs_locations array. */
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			0, PAGE_SIZE);
 	encode_nops(&hdr);
@@ -6824,13 +6835,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lookup(xdr);
-	if (status)
-		goto out;
-	xdr_enter_page(xdr, PAGE_SIZE);
-	status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
+	if (res->migration) {
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
 					 NULL, res->fs_locations,
 					 NULL, res->fs_locations->server);
+		if (status)
+			goto out;
+		if (res->renew)
+			status = decode_renew(xdr);
+	} else {
+		status = decode_lookup(xdr);
+		if (status)
+			goto out;
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
+					 NULL, res->fs_locations,
+					 NULL, res->fs_locations->server);
+	}
 out:
 	return status;
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 49f52c8f4422..405dfadcbc3b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1053,14 +1053,18 @@ struct nfs4_fs_locations {
 struct nfs4_fs_locations_arg {
 	struct nfs4_sequence_args	seq_args;
 	const struct nfs_fh *dir_fh;
+	const struct nfs_fh *fh;
 	const struct qstr *name;
 	struct page *page;
 	const u32 *bitmask;
+	clientid4 clientid;
+	unsigned char migration:1, renew:1;
 };
 
 struct nfs4_fs_locations_res {
 	struct nfs4_sequence_res	seq_res;
 	struct nfs4_fs_locations       *fs_locations;
+	unsigned char			migration:1, renew:1;
 };
 
 struct nfs4_secinfo4 {
-- 
cgit v1.3


From ce6cda1845cf2332d2c411a5c72cd166256b21da Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:12:56 -0400
Subject: NFS: Add a super_block backpointer to the nfs_server struct

NFS_SB() returns the pointer to an nfs_server struct, given a
pointer to a super_block.  But we have no way to go back the other
way.

Add a super_block backpointer field so that, given an nfs_server
struct, it is easy to get to the filesystem's root dentry.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c            | 1 +
 include/linux/nfs_fs_sb.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3f5a7a85c9c2..e26647be69ce 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2534,6 +2534,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 			mntroot = ERR_PTR(error);
 			goto error_splat_bdi;
 		}
+		server->super = s;
 	}
 
 	if (!s->s_root) {
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index f9c0a6cb41e9..46b0cb4bee09 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -148,6 +148,7 @@ struct nfs_server {
 	__u64			maxfilesize;	/* maximum file size */
 	struct timespec		time_delta;	/* smallest time granularity */
 	unsigned long		mount_time;	/* when this fs was mounted */
+	struct super_block	*super;		/* VFS super block */
 	dev_t			s_dev;		/* superblock dev numbers */
 
 #ifdef CONFIG_NFS_FSCACHE
-- 
cgit v1.3


From c9fdeb280b8cc511c6730db9ab3973331ea344e0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:02 -0400
Subject: NFS: Add basic migration support to state manager thread

Migration recovery and state recovery must be serialized, so handle
both in the state manager thread.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h          |   2 +
 fs/nfs/nfs4client.c       |   1 +
 fs/nfs/nfs4state.c        | 161 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs_fs_sb.h |   7 ++
 4 files changed, 168 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index e59d3b4c7944..94e783ffcf08 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -29,6 +29,7 @@ enum nfs4_client_state {
 	NFS4CLNT_SERVER_SCOPE_MISMATCH,
 	NFS4CLNT_PURGE_STATE,
 	NFS4CLNT_BIND_CONN_TO_SESSION,
+	NFS4CLNT_MOVED,
 };
 
 #define NFS4_RENEW_TIMEOUT		0x01
@@ -421,6 +422,7 @@ extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index a2cba66f8345..0cde95e9bd6a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -197,6 +197,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+	clp->cl_mig_gen = 1;
 	return clp;
 
 error:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index fa2706a4225a..ba0db18b5f67 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -239,8 +239,6 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	}
 }
 
-#if defined(CONFIG_NFS_V4_1)
-
 static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
 {
 	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
@@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	return nfs4_drain_slot_tbl(&ses->fc_slot_table);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
 static int nfs41_setup_state_renewal(struct nfs_client *clp)
 {
 	int status;
@@ -1197,6 +1197,42 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
 
+/**
+ * nfs4_schedule_migration_recovery - trigger migration recovery
+ *
+ * @server: FSID that is migrating
+ *
+ * Returns zero if recovery has started, otherwise a negative NFS4ERR
+ * value is returned.
+ */
+int nfs4_schedule_migration_recovery(const struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	if (server->fh_expire_type != NFS4_FH_PERSISTENT) {
+		pr_err("NFS: volatile file handles not supported (server %s)\n",
+				clp->cl_hostname);
+		return -NFS4ERR_IO;
+	}
+
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -NFS4ERR_IO;
+
+	dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n",
+			__func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	set_bit(NFS_MIG_IN_TRANSITION,
+			&((struct nfs_server *)server)->mig_status);
+	set_bit(NFS4CLNT_MOVED, &clp->cl_state);
+
+	nfs4_schedule_state_manager(clp);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
+
 int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
@@ -1826,6 +1862,119 @@ static int nfs4_purge_lease(struct nfs_client *clp)
 	return 0;
 }
 
+/*
+ * Try remote migration of one FSID from a source server to a
+ * destination server.  The source server provides a list of
+ * potential destinations.
+ *
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_fs_locations *locations = NULL;
+	struct inode *inode;
+	struct page *page;
+	int status, result;
+
+	dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	result = 0;
+	page = alloc_page(GFP_KERNEL);
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (page == NULL || locations == NULL) {
+		dprintk("<-- %s: no memory\n", __func__);
+		goto out;
+	}
+
+	inode = server->super->s_root->d_inode;
+	result = nfs4_proc_get_locations(inode, locations, page, cred);
+	if (result) {
+		dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
+			__func__, result);
+		goto out;
+	}
+
+	result = -NFS4ERR_NXIO;
+	if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+		dprintk("<-- %s: No fs_locations data, migration skipped\n",
+			__func__);
+		goto out;
+	}
+
+	nfs4_begin_drain_session(clp);
+
+	status = nfs4_replace_transport(server, locations);
+	if (status != 0) {
+		dprintk("<-- %s: failed to replace transport: %d\n",
+			__func__, status);
+		goto out;
+	}
+
+	result = 0;
+	dprintk("<-- %s: migration succeeded\n", __func__);
+
+out:
+	if (page != NULL)
+		__free_page(page);
+	kfree(locations);
+	if (result) {
+		pr_err("NFS: migration recovery failed (server %s)\n",
+				clp->cl_hostname);
+		set_bit(NFS_MIG_FAILED, &server->mig_status);
+	}
+	return result;
+}
+
+/*
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_handle_migration(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: migration reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION,
+						&server->mig_status))
+			continue;
+
+		rcu_read_unlock();
+		status = nfs4_try_migration(server, cred);
+		if (status < 0) {
+			put_rpccred(cred);
+			return status;
+		}
+		goto restart;
+	}
+	rcu_read_unlock();
+	put_rpccred(cred);
+	return 0;
+}
+
 /**
  * nfs4_discover_server_trunking - Detect server IP address trunking
  *
@@ -2154,7 +2303,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_check_lease(clp);
 			if (status < 0)
 				goto out_error;
-			continue;
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
+			section = "migration";
+			status = nfs4_handle_migration(clp);
+			if (status < 0)
+				goto out_error;
 		}
 
 		/* First recover reboot state... */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 46b0cb4bee09..186ec4969313 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -79,6 +79,7 @@ struct nfs_client {
 	char			cl_ipaddr[48];
 	u32			cl_cb_ident;	/* v4.0 callback identifier */
 	const struct nfs4_minor_version_ops *cl_mvops;
+	unsigned long		cl_mig_gen;
 
 	/* NFSv4.0 transport blocking */
 	struct nfs4_slot_table	*cl_slot_tbl;
@@ -189,6 +190,12 @@ struct nfs_server {
 	struct list_head	state_owners_lru;
 	struct list_head	layouts;
 	struct list_head	delegations;
+
+	unsigned long		mig_gen;
+	unsigned long		mig_status;
+#define NFS_MIG_IN_TRANSITION		(1)
+#define NFS_MIG_FAILED			(2)
+
 	void (*destroy)(struct nfs_server *);
 
 	atomic_t active; /* Keep trace of any activity to this server */
-- 
cgit v1.3


From f1478c13c0a606b960becaa70e82065835b16fbf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:07 -0400
Subject: NFS: Re-use exit code in nfs4_async_handle_error()

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c71c16e0ac22..2614b46a0cc5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4750,19 +4750,15 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
 			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto restart_call;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
-			task->tk_status = 0;
-			return -EAGAIN;
 		case -NFS4ERR_RETRY_UNCACHED_REP:
 		case -NFS4ERR_OLD_STATEID:
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto restart_call;
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
@@ -4773,6 +4769,7 @@ wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+restart_call:
 	task->tk_status = 0;
 	return -EAGAIN;
 }
-- 
cgit v1.3


From 9f51a78e3a637a5923c661d2abe958c63441696f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:13 -0400
Subject: NFS: Rename "stateid_invalid" label

I'm going to use this exit label also for migration recovery
failures.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2614b46a0cc5..fa87f81527b3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4728,12 +4728,12 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			if (state == NULL)
 				break;
 			if (nfs4_schedule_stateid_recovery(server, state) < 0)
-				goto stateid_invalid;
+				goto recovery_failed;
 			goto wait_on_recovery;
 		case -NFS4ERR_EXPIRED:
 			if (state != NULL) {
 				if (nfs4_schedule_stateid_recovery(server, state) < 0)
-					goto stateid_invalid;
+					goto recovery_failed;
 			}
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
@@ -4762,7 +4762,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
-stateid_invalid:
+recovery_failed:
 	task->tk_status = -EIO;
 	return 0;
 wait_on_recovery:
-- 
cgit v1.3


From 519ae255d403219fd5375de13c95dea3ef1cfda0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:19 -0400
Subject: NFS: Add migration recovery callouts in nfs4proc.c

When a server returns NFS4ERR_MOVED, trigger the new migration
recovery logic in the state manager.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fa87f81527b3..a1f620950f11 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -384,6 +384,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			ret = nfs4_schedule_migration_recovery(server);
+			if (ret < 0)
+				break;
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 	return nfs4_map_errors(ret);
 wait_on_recovery:
 	ret = nfs4_wait_clnt_recover(clp);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -EIO;
 	if (ret == 0)
 		exception->retry = 1;
 	return ret;
@@ -2974,11 +2981,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
 	status = nfs4_proc_fs_locations(client, dir, name, locations, page);
 	if (status != 0)
 		goto out;
-	/* Make sure server returned a different fsid for the referral */
+
+	/*
+	 * If the fsid didn't change, this is a migration event, not a
+	 * referral.  Cause us to drop into the exception handler, which
+	 * will kick off migration recovery.
+	 */
 	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
 		dprintk("%s: server did not return a different fsid for"
 			" a referral at %s\n", __func__, name->name);
-		status = -EIO;
+		status = -NFS4ERR_MOVED;
 		goto out;
 	}
 	/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
@@ -4739,6 +4751,10 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			if (nfs4_schedule_migration_recovery(server) < 0)
+				goto recovery_failed;
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -4769,6 +4785,8 @@ wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		goto recovery_failed;
 restart_call:
 	task->tk_status = 0;
 	return -EAGAIN;
-- 
cgit v1.3


From 352297b917d8a3e61a778ffb0f0195ce8550d7f5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:24 -0400
Subject: NFS: Handle NFS4ERR_MOVED during delegation recall

When a server returns NFS4ERR_MOVED during a delegation recall,
trigger the new migration recovery logic in the state manager.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a1f620950f11..552e4f7a8225 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1575,6 +1575,9 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 			/* Don't recall a delegation if it was lost */
 			nfs4_schedule_lease_recovery(server->nfs_client);
 			return -EAGAIN;
+		case -NFS4ERR_MOVED:
+			nfs4_schedule_migration_recovery(server);
+			return -EAGAIN;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-- 
cgit v1.3


From 44c9993384e9311cd56acf6ead3baffab616ae50 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:30 -0400
Subject: NFS: Add method to detect whether an FSID is still on the server

Introduce a mechanism for probing a server to determine if an FSID
is present or absent.

The on-the-wire compound is different between minor version 0 and 1.
Minor version 0 appends a RENEW operation to identify which client
ID is probing.  Minor version 1 has a SEQUENCE operation in the
compound which effectively carries the same information.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        |   2 +
 fs/nfs/nfs4proc.c       | 128 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4xdr.c        |  67 ++++++++++++++++++++++++-
 include/linux/nfs4.h    |   1 +
 include/linux/nfs_xdr.h |  13 +++++
 5 files changed, 209 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 94e783ffcf08..2f0f8c216441 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -208,6 +208,7 @@ struct nfs4_state_maintenance_ops {
 struct nfs4_mig_recovery_ops {
 	int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
 		struct page *, struct rpc_cred *);
+	int (*fsid_present)(struct inode *, struct rpc_cred *);
 };
 
 extern const struct dentry_operations nfs4_dentry_operations;
@@ -242,6 +243,7 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc
 				  struct nfs4_fs_locations *, struct page *);
 extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
 		struct page *page, struct rpc_cred *);
+extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 552e4f7a8225..01b90bd341bb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6153,6 +6153,132 @@ int nfs4_proc_get_locations(struct inode *inode,
 	return status;
 }
 
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop
+ * returning NFS4ERR_LEASE_MOVED to this client.  A RENEW operation
+ * is appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+		.clientid	= clp->cl_clientid,
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fsid_present_res res = {
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status)
+		return status;
+
+	do_renew_lease(clp, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID performing
+ * this operation is identified in the SEQUENCE operation in this
+ * compound.
+ */
+static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+	};
+	struct nfs4_fsid_present_res res = {
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_fsid_present - Is this FSID present or absent on server?
+ * @inode: inode on FSID to check
+ * @cred: credential to use for this operation
+ *
+ * Server indicates whether the FSID is present, moved, or not
+ * recognized.  This operation is necessary to clear a LEASE_MOVED
+ * condition for this client ID.
+ *
+ * Returns NFS4_OK if the FSID is present on this server,
+ * -NFS4ERR_MOVED if the FSID is no longer present, a negative
+ *  NFS4ERR code if some error occurred on the server, or a
+ *  negative errno if a local failure occurred.
+ */
+int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->fsid_present(inode, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
 /**
  * If 'use_integrity' is true and the state managment nfs_client
  * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
@@ -8052,11 +8178,13 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 
 static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
 	.get_locations = _nfs40_proc_get_locations,
+	.fsid_present = _nfs40_proc_fsid_present,
 };
 
 #if defined(CONFIG_NFS_V4_1)
 static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
 	.get_locations = _nfs41_proc_get_locations,
+	.fsid_present = _nfs41_proc_fsid_present,
 };
 #endif	/* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1854b04f828f..f903389d90f1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -595,11 +595,13 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_getattr_maxsz)
+				encode_getattr_maxsz + \
+				encode_renew_maxsz)
 #define NFS4_dec_getattr_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_getattr_maxsz)
+				decode_getattr_maxsz + \
+				decode_renew_maxsz)
 #define NFS4_enc_lookup_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -753,6 +755,18 @@ static int nfs4_stat_to_errno(int);
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_secinfo_maxsz)
+#define NFS4_enc_fsid_present_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getfh_maxsz + \
+				 encode_renew_maxsz)
+#define NFS4_dec_fsid_present_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_getfh_maxsz + \
+				 decode_renew_maxsz)
 #if defined(CONFIG_NFS_V4_1)
 #define NFS4_enc_bind_conn_to_session_sz \
 				(compound_encode_hdr_maxsz + \
@@ -2726,6 +2740,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode FSID_PRESENT request
+ */
+static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fsid_present_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfh(xdr, &hdr);
+	if (args->renew)
+		encode_renew(xdr, args->clientid, &hdr);
+	encode_nops(&hdr);
+}
+
 #if defined(CONFIG_NFS_V4_1)
 /*
  * BIND_CONN_TO_SESSION request
@@ -6883,6 +6917,34 @@ out:
 	return status;
 }
 
+/*
+ * Decode FSID_PRESENT response
+ */
+static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs4_fsid_present_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, res->fh);
+	if (status)
+		goto out;
+	if (res->renew)
+		status = decode_renew(xdr);
+out:
+	return status;
+}
+
 #if defined(CONFIG_NFS_V4_1)
 /*
  * Decode BIND_CONN_TO_SESSION response
@@ -7397,6 +7459,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations),
 	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner),
 	PROC(SECINFO,		enc_secinfo,		dec_secinfo),
+	PROC(FSID_PRESENT,	enc_fsid_present,	dec_fsid_present),
 #if defined(CONFIG_NFS_V4_1)
 	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
 	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e36dee52f224..c56fa8fedce9 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -460,6 +460,7 @@ enum {
 	NFSPROC4_CLNT_FS_LOCATIONS,
 	NFSPROC4_CLNT_RELEASE_LOCKOWNER,
 	NFSPROC4_CLNT_SECINFO,
+	NFSPROC4_CLNT_FSID_PRESENT,
 
 	/* nfs41 */
 	NFSPROC4_CLNT_EXCHANGE_ID,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 405dfadcbc3b..8fe5b940c5f2 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1088,6 +1088,19 @@ struct nfs4_secinfo_res {
 	struct nfs4_secinfo_flavors	*flavors;
 };
 
+struct nfs4_fsid_present_arg {
+	struct nfs4_sequence_args	seq_args;
+	const struct nfs_fh		*fh;
+	clientid4			clientid;
+	unsigned char			renew:1;
+};
+
+struct nfs4_fsid_present_res {
+	struct nfs4_sequence_res	seq_res;
+	struct nfs_fh			*fh;
+	unsigned char			renew:1;
+};
+
 #endif /* CONFIG_NFS_V4 */
 
 struct nfstime4 {
-- 
cgit v1.3


From b7f7a66e420a9f3ec82a5124720013cee317e73a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:35 -0400
Subject: NFS: Support NFS4ERR_LEASE_MOVED recovery in state manager

A migration on the FSID in play for the current NFS operation
is reported via the error status code NFS4ERR_MOVED.

"Lease moved" means that a migration has occurred on some other
FSID than the one for the current operation.  It's a signal that
the client should take action immediately to handle a migration
that it may not have noticed otherwise.  This is so that the
client's lease does not expire unnoticed on the destination server.

In NFSv4.0, a moved lease is reported with the NFS4ERR_LEASE_MOVED
error status code.

To recover from NFS4ERR_LEASE_MOVED, check each FSID for that server
to see if it is still present.  Invoke nfs4_try_migration() if the
FSID is no longer present on the server.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h   |  2 ++
 fs/nfs/nfs4state.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 2f0f8c216441..210e44ed1d2a 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -30,6 +30,7 @@ enum nfs4_client_state {
 	NFS4CLNT_PURGE_STATE,
 	NFS4CLNT_BIND_CONN_TO_SESSION,
 	NFS4CLNT_MOVED,
+	NFS4CLNT_LEASE_MOVED,
 };
 
 #define NFS4_RENEW_TIMEOUT		0x01
@@ -425,6 +426,7 @@ extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
+extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ba0db18b5f67..552706d2d776 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1233,6 +1233,22 @@ int nfs4_schedule_migration_recovery(const struct nfs_server *server)
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
 
+/**
+ * nfs4_schedule_lease_moved_recovery - start lease-moved recovery
+ *
+ * @clp: server to check for moved leases
+ *
+ */
+void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp)
+{
+	dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n",
+		__func__, clp->cl_clientid, clp->cl_hostname);
+
+	set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery);
+
 int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
@@ -1661,7 +1677,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			nfs4_state_end_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_STALE_CLIENTID:
-		case -NFS4ERR_LEASE_MOVED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 			nfs4_state_clear_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
@@ -1975,6 +1990,55 @@ restart:
 	return 0;
 }
 
+/*
+ * Test each nfs_server on the clp's cl_superblocks list to see
+ * if it's moved to another server.  Stop when the server no longer
+ * returns NFS4ERR_LEASE_MOVED.
+ */
+static int nfs4_handle_lease_moved(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: lease moved reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		struct inode *inode;
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		rcu_read_unlock();
+
+		inode = server->super->s_root->d_inode;
+		status = nfs4_proc_fsid_present(inode, cred);
+		if (status != -NFS4ERR_MOVED)
+			goto restart;	/* wasn't this one */
+		if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED)
+			goto restart;	/* there are more */
+		goto out;
+	}
+	rcu_read_unlock();
+
+out:
+	put_rpccred(cred);
+	return 0;
+}
+
 /**
  * nfs4_discover_server_trunking - Detect server IP address trunking
  *
@@ -2312,6 +2376,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
 				goto out_error;
 		}
 
+		if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) {
+			section = "lease moved";
+			status = nfs4_handle_lease_moved(clp);
+			if (status < 0)
+				goto out_error;
+		}
+
 		/* First recover reboot state... */
 		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			section = "reclaim reboot";
-- 
cgit v1.3


From 8ef2f8d46aca7ff2e7e2b926bb563212aa63a4af Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:41 -0400
Subject: NFS: Implement support for NFS4ERR_LEASE_MOVED

Trigger lease-moved recovery when a request returns
NFS4ERR_LEASE_MOVED.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 01b90bd341bb..2564e1c89f56 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -389,6 +389,9 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 			if (ret < 0)
 				break;
 			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -1578,6 +1581,9 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 		case -NFS4ERR_MOVED:
 			nfs4_schedule_migration_recovery(server);
 			return -EAGAIN;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(server->nfs_client);
+			return -EAGAIN;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
@@ -4758,6 +4764,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			if (nfs4_schedule_migration_recovery(server) < 0)
 				goto recovery_failed;
 			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
-- 
cgit v1.3


From 60ea68129942dc36cd1a3a9bdaec783369ee5a6d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:47 -0400
Subject: NFS: Migration support for RELEASE_LOCKOWNER

Currently the Linux NFS client ignores the operation status code for
the RELEASE_LOCKOWNER operation.  Like NFSv3's UMNT operation,
RELEASE_LOCKOWNER is a courtesy to help servers manage their
resources, and the outcome is not consequential for the client.

During a migration, a server may report NFS4ERR_LEASE_MOVED, in
which case the client really should retry, since typically
LEASE_MOVED has nothing to do with the current operation, but does
prevent it from going forward.

Also, it's important for a client to respond as soon as possible to
a moved lease condition, since the client's lease could expire on
the destination without further action by the client.

NFS4ERR_DELAY is not included in the list of valid status codes for
RELEASE_LOCKOWNER in RFC 3530bis.  However, rfc3530-migration-update
does permit migration-capable servers to return DELAY to clients,
but only in the context of an ongoing migration.  In this case the
server has frozen lock state in preparation for migration, and a
client retry would help the destination server purge unneeded state
once migration recovery is complete.

Interestly, NFS4ERR_MOVED is not valid for RELEASE_LOCKOWNER, even
though lock owners can be migrated with Transparent State Migration.

Note that RFC 3530bis section 9.5 includes RELEASE_LOCKOWNER in the
list of operations that renew a client's lease on the server if they
succeed.  Now that our client pays attention to the operation's
status code, we can note that renewal appropriately.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2564e1c89f56..9f2ccf7471ee 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5800,6 +5800,7 @@ struct nfs_release_lockowner_data {
 	struct nfs_release_lockowner_args args;
 	struct nfs4_sequence_args seq_args;
 	struct nfs4_sequence_res seq_res;
+	unsigned long timestamp;
 };
 
 static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
@@ -5807,12 +5808,27 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
 	struct nfs_release_lockowner_data *data = calldata;
 	nfs40_setup_sequence(data->server,
 				&data->seq_args, &data->seq_res, task);
+	data->timestamp = jiffies;
 }
 
 static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs_release_lockowner_data *data = calldata;
+	struct nfs_server *server = data->server;
+
 	nfs40_sequence_done(task, &data->seq_res);
+
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(server, data->timestamp);
+		break;
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_LEASE_MOVED:
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+			rpc_restart_call_prepare(task);
+	}
 }
 
 static void nfs4_release_lockowner_release(void *calldata)
-- 
cgit v1.3


From f8aba1e8d509c0db7a82893e595a7743ce07ea83 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:53 -0400
Subject: NFS: Handle NFS4ERR_LEASE_MOVED during async RENEW

With NFSv4 minor version 0, the asynchronous lease RENEW
heartbeat can return NFS4ERR_LEASE_MOVED.  Error recovery logic for
async RENEW is a separate code path from the generic NFS proc paths,
so it must be updated to handle NFS4ERR_LEASE_MOVED as well.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9f2ccf7471ee..8aa8ff3fd5e7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4235,7 +4235,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 	unsigned long timestamp = data->timestamp;
 
 	trace_nfs4_renew_async(clp, task->tk_status);
-	if (task->tk_status < 0) {
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_LEASE_MOVED:
+		nfs4_schedule_lease_moved_recovery(clp);
+		break;
+	default:
 		/* Unless we're shutting down, schedule state recovery! */
 		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
 			return;
-- 
cgit v1.3


From d1c2331e75d7a5a22e2910a8bd042ea90b543db1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:13:58 -0400
Subject: NFS: Handle SEQ4_STATUS_LEASE_MOVED

With the advent of NFSv4 sessions in NFSv4.1 and following, a "lease
moved" condition is reported differently than it is in NFSv4.0.

NFSv4 minor version 0 servers return an error status code,
NFS4ERR_LEASE_MOVED, to signal that a lease has moved.  This error
causes the whole compound operation to fail.  Normal compounds
against this server continue to fail until the client performs
migration recovery on the migrated share.

Minor version 1 and later servers assert a bit flag in the reply to
a compound's SEQUENCE operation to signal LEASE_MOVED.  This is not
a fatal condition: operations against this server continue normally.
The server asserts this flag until the client performs migration
recovery on the migrated share.

Note that servers MUST NOT return NFS4ERR_LEASE_MOVED to NFSv4
clients not using NFSv4.0.

After the server asserts any of the sr_status_flags in the SEQUENCE
operation in a typical compound, our client initiates standard lease
recovery.  For NFSv4.1+, a stand-alone SEQUENCE operation is
performed to discover what recovery is needed.

If SEQ4_STATUS_LEASE_MOVED is asserted in this stand-alone SEQUENCE
operation, our client attempts to discover which FSIDs have been
migrated, and then performs migration recovery on each.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 552706d2d776..62c08bf60e65 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2227,9 +2227,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 		nfs41_handle_server_reboot(clp);
 	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
 			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
-			    SEQ4_STATUS_ADMIN_STATE_REVOKED |
-			    SEQ4_STATUS_LEASE_MOVED))
+			    SEQ4_STATUS_ADMIN_STATE_REVOKED))
 		nfs41_handle_state_revoked(clp);
+	if (flags & SEQ4_STATUS_LEASE_MOVED)
+		nfs4_schedule_lease_moved_recovery(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
 		nfs41_handle_recallable_state_revoked(clp);
 	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
-- 
cgit v1.3


From cd3fadece2b97462583500371a3eb9b091c49603 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:14:04 -0400
Subject: NFS: Set EXCHGID4_FLAG_SUPP_MOVED_MIGR

Broadly speaking, v4.1 migration is untested.  There are no servers
in the wild that support NFSv4.1 migration.  However, as server
implementations become available, we do want to enable testing by
developers, while leaving it disabled for environments for which
broken migration support would be an unpleasant surprise.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Kconfig    | 11 +++++++++++
 fs/nfs/nfs4proc.c |  8 +++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index b5e80b0af315..38c1768b4142 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
 	  If the NFS client is unchanged from the upstream kernel, this
 	  option should be set to the default "kernel.org".
 
+config NFS_V4_1_MIGRATION
+	bool "NFSv4.1 client support for migration"
+	depends on NFS_V4_1
+	default n
+	help
+	  This option makes the NFS client advertise to NFSv4.1 servers that
+          it can support NFSv4 migration.
+
+          The NFSv4.1 pieces of the Linux NFSv4 migration implementation are
+          still experimental.  If you are not an NFSv4 developer, say N here.
+
 config NFS_V4_SECURITY_LABEL
 	bool
 	depends on NFS_V4_2 && SECURITY
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8aa8ff3fd5e7..1463c71b0862 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6596,8 +6596,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	struct nfs41_exchange_id_args args = {
 		.verifier = &verifier,
 		.client = clp,
+#ifdef CONFIG_NFS_V4_1_MIGRATION
 		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-			EXCHGID4_FLAG_BIND_PRINC_STATEID,
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID |
+			 EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
+		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
 	};
 	struct nfs41_exchange_id_res res = {
 		0
-- 
cgit v1.3


From 0625c2dd6ae32a221871ffc04bb752216b823001 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Oct 2013 14:14:10 -0400
Subject: NFS: Fix possible endless state recovery wait

In nfs4_wait_clnt_recover(), hold a reference to the clp being
waited on.  The state manager can reduce clp->cl_count to 1, in
which case the nfs_put_client() in nfs4_run_state_manager() can
free *clp before wait_on_bit() returns and allows
nfs4_wait_clnt_recover() to run again.

The behavior at that point is non-deterministic.  If the waited-on
bit still happens to be zero, wait_on_bit() will wake the waiter as
expected.  If the bit is set again (say, if the memory was poisoned
when freed) wait_on_bit() can leave the waiter asleep.

This is a narrow fix which ensures the safety of accessing *clp in
nfs4_wait_clnt_recover(), but does not address the continued use
of a possibly freed *clp after nfs4_wait_clnt_recover() returns
(see nfs_end_delegation_return(), for example).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 62c08bf60e65..452f4c8dadea 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1255,14 +1255,16 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
 
 	might_sleep();
 
+	atomic_inc(&clp->cl_count);
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
 			nfs_wait_bit_killable, TASK_KILLABLE);
 	if (res)
-		return res;
-
+		goto out;
 	if (clp->cl_cons_state < 0)
-		return clp->cl_cons_state;
-	return 0;
+		res = clp->cl_cons_state;
+out:
+	nfs_put_client(clp);
+	return res;
 }
 
 int nfs4_client_recover_expired_lease(struct nfs_client *clp)
-- 
cgit v1.3


From 47fd88e6b79c55e6acccaf832078ed1a340672fa Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:15 -0400
Subject: NFSv4: make nfs_find_best_sec static

It's not used outside of nfs4namespace.c anymore.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h       | 1 -
 fs/nfs/nfs4namespace.c | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 210e44ed1d2a..3ce79b04522e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -222,7 +222,6 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
 extern struct file_system_type nfs4_fs_type;
 
 /* nfs4namespace.c */
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
 struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 			       struct nfs_fh *, struct nfs_fattr *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index ebd8b064843b..4ec0eea68fdf 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -145,7 +145,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
  * is searched in the order returned from the server, per RFC 3530
  * recommendation.
  */
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 {
 	rpc_authflavor_t pseudoflavor;
 	struct nfs4_secinfo4 *secinfo;
-- 
cgit v1.3


From a3f73c27afff9590a4432879b7145289cb89cf0a Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:16 -0400
Subject: NFS: separate passed security flavs from selected

When filling parsed_mount_data, store the parsed sec= mount option in
the new struct nfs_auth_info and the chosen flavor in selected_flavor.

This patch lays the groundwork for supporting multiple sec= options.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c         |  3 ++-
 fs/nfs/internal.h       |  4 ++--
 fs/nfs/nfs4client.c     | 16 +++++++++-------
 fs/nfs/super.c          | 47 ++++++++++++++++++++++-------------------------
 include/linux/nfs_xdr.h |  6 ++++++
 5 files changed, 41 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 692fd0e9362f..f5a7f7f9cd59 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -787,7 +787,8 @@ static int nfs_init_server(struct nfs_server *server,
 
 	server->port = data->nfs_server.port;
 
-	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e5a6bd12562a..c8cd044f0982 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -88,8 +88,8 @@ struct nfs_parsed_mount_data {
 	unsigned int		namlen;
 	unsigned int		options;
 	unsigned int		bsize;
-	unsigned int		auth_flavor_len;
-	rpc_authflavor_t	auth_flavors[1];
+	struct nfs_auth_info	auth_info;
+	rpc_authflavor_t	selected_flavor;
 	char			*client_address;
 	unsigned int		version;
 	unsigned int		minorversion;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0cde95e9bd6a..d65090e5d00f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -949,9 +949,8 @@ out:
  * Create a version 4 volume record
  */
 static int nfs4_init_server(struct nfs_server *server,
-		const struct nfs_parsed_mount_data *data)
+		struct nfs_parsed_mount_data *data)
 {
-	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
 	struct rpc_timeout timeparms;
 	int error;
 
@@ -964,8 +963,10 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->flags = data->flags;
 	server->options = data->options;
 
-	if (data->auth_flavor_len >= 1)
-		pseudoflavor = data->auth_flavors[0];
+	if (data->auth_info.flavor_len >= 1)
+		data->selected_flavor = data->auth_info.flavors[0];
+	else
+		data->selected_flavor = RPC_AUTH_UNIX;
 
 	/* Get a client record */
 	error = nfs4_set_client(server,
@@ -973,7 +974,7 @@ static int nfs4_init_server(struct nfs_server *server,
 			(const struct sockaddr *)&data->nfs_server.address,
 			data->nfs_server.addrlen,
 			data->client_address,
-			pseudoflavor,
+			data->selected_flavor,
 			data->nfs_server.protocol,
 			&timeparms,
 			data->minorversion,
@@ -993,7 +994,8 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	server->port = data->nfs_server.port;
 
-	error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor);
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
 
 error:
 	/* Done */
@@ -1020,7 +1022,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
-	auth_probe = mount_info->parsed->auth_flavor_len < 1;
+	auth_probe = mount_info->parsed->auth_info.flavor_len < 1;
 
 	/* set up the general RPC client */
 	error = nfs4_init_server(server, mount_info->parsed);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e26647be69ce..b87744fb9dcd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -923,8 +923,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 		data->mount_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-		data->auth_flavors[0]	= RPC_AUTH_MAXFLAVOR;
-		data->auth_flavor_len	= 0;
+		data->selected_flavor	= RPC_AUTH_MAXFLAVOR;
 		data->minorversion	= 0;
 		data->need_mount	= true;
 		data->net		= current->nsproxy->net_ns;
@@ -1019,13 +1018,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
 	}
 }
 
-static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data,
-		rpc_authflavor_t pseudoflavor)
-{
-	data->auth_flavors[0] = pseudoflavor;
-	data->auth_flavor_len = 1;
-}
-
 /*
  * Parse the value of the 'sec=' option.
  */
@@ -1076,7 +1068,8 @@ static int nfs_parse_security_flavors(char *value,
 	}
 
 	mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-	nfs_set_auth_parsed_mount_data(mnt, pseudoflavor);
+	mnt->auth_info.flavors[0] = pseudoflavor;
+	mnt->auth_info.flavor_len = 1;
 	return 1;
 }
 
@@ -1623,7 +1616,7 @@ out_security_failure:
 }
 
 /*
- * Ensure that the specified authtype in args->auth_flavors[0] is supported by
+ * Ensure that the specified authtype in args->auth_info is supported by
  * the server. Returns 0 if it's ok, and -EACCES if not.
  */
 static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
@@ -1640,17 +1633,18 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
 	 * can be used.
 	 */
 	for (i = 0; i < count; i++) {
-		if (args->auth_flavors[0] == server_authlist[i] ||
+		if (args->auth_info.flavors[0] == server_authlist[i] ||
 		    server_authlist[i] == RPC_AUTH_NULL)
 			goto out;
 	}
 
 	dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
-		args->auth_flavors[0]);
+		args->auth_info.flavors[0]);
 	return -EACCES;
 
 out:
-	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+	args->selected_flavor = args->auth_info.flavors[0];
+	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);
 	return 0;
 }
 
@@ -1738,9 +1732,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 	 * Was a sec= authflavor specified in the options? First, verify
 	 * whether the server supports it, and then just try to use it if so.
 	 */
-	if (args->auth_flavor_len > 0) {
+	if (args->auth_info.flavor_len > 0) {
 		status = nfs_verify_authflavor(args, authlist, authlist_len);
-		dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+		dfprintk(MOUNT, "NFS: using auth flavor %u\n",
+			 args->selected_flavor);
 		if (status)
 			return ERR_PTR(status);
 		return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
@@ -1769,7 +1764,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 			/* Fallthrough */
 		}
 		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
-		nfs_set_auth_parsed_mount_data(args, flavor);
+		args->selected_flavor = flavor;
 		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 		if (!IS_ERR(server))
 			return server;
@@ -1785,7 +1780,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 
 	/* Last chance! Try AUTH_UNIX */
 	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
-	nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+	args->selected_flavor = RPC_AUTH_UNIX;
 	return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 }
 
@@ -1972,9 +1967,9 @@ static int nfs23_validate_mount_data(void *options,
 		args->bsize		= data->bsize;
 
 		if (data->flags & NFS_MOUNT_SECFLAVOUR)
-			nfs_set_auth_parsed_mount_data(args, data->pseudoflavor);
+			args->selected_flavor = data->pseudoflavor;
 		else
-			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+			args->selected_flavor = RPC_AUTH_UNIX;
 		if (!args->nfs_server.hostname)
 			goto out_nomem;
 
@@ -2108,7 +2103,7 @@ static int nfs_validate_text_mount_data(void *options,
 
 	nfs_set_port(sap, &args->nfs_server.port, port);
 
-	if (args->auth_flavor_len > 1)
+	if (args->auth_info.flavor_len > 1)
 		goto out_bad_auth;
 
 	return nfs_parse_devname(dev_name,
@@ -2146,7 +2141,7 @@ nfs_compare_remount_data(struct nfs_server *nfss,
 	    data->version != nfss->nfs_client->rpc_ops->version ||
 	    data->minorversion != nfss->nfs_client->cl_minorversion ||
 	    data->retrans != nfss->client->cl_timeout->to_retries ||
-	    data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
+	    data->selected_flavor != nfss->client->cl_auth->au_flavor ||
 	    data->acregmin != nfss->acregmin / HZ ||
 	    data->acregmax != nfss->acregmax / HZ ||
 	    data->acdirmin != nfss->acdirmin / HZ ||
@@ -2191,7 +2186,9 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->rsize = nfss->rsize;
 	data->wsize = nfss->wsize;
 	data->retrans = nfss->client->cl_timeout->to_retries;
-	nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor);
+	data->selected_flavor = nfss->client->cl_auth->au_flavor;
+	data->auth_info.flavors[0] = nfss->client->cl_auth->au_flavor;
+	data->auth_info.flavor_len = 1;
 	data->acregmin = nfss->acregmin / HZ;
 	data->acregmax = nfss->acregmax / HZ;
 	data->acdirmin = nfss->acdirmin / HZ;
@@ -2718,9 +2715,9 @@ static int nfs4_validate_mount_data(void *options,
 					   data->auth_flavours,
 					   sizeof(pseudoflavor)))
 				return -EFAULT;
-			nfs_set_auth_parsed_mount_data(args, pseudoflavor);
+			args->selected_flavor = pseudoflavor;
 		} else
-			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+			args->selected_flavor = RPC_AUTH_UNIX;
 
 		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
 		if (IS_ERR(c))
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8fe5b940c5f2..658104acf13b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -591,6 +591,12 @@ struct nfs_renameres {
 	struct nfs_fattr		*new_fattr;
 };
 
+/* parsed sec= options */
+struct nfs_auth_info {
+	unsigned int            flavor_len;
+	rpc_authflavor_t        flavors[1];
+};
+
 /*
  * Argument struct for decode_entry function
  */
-- 
cgit v1.3


From 0f5f49b8b3593309fd3c3a2080a5fd465afdbe16 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:17 -0400
Subject: NFS: cache parsed auth_info in nfs_server

Cache the auth_info structure in nfs_server and pass these values to submounts.

This lays the groundwork for supporting multiple sec= options.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 2 ++
 fs/nfs/nfs4client.c       | 1 +
 fs/nfs/super.c            | 3 +--
 include/linux/nfs_fs_sb.h | 1 +
 4 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f5a7f7f9cd59..1d09289c8f0e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -786,6 +786,7 @@ static int nfs_init_server(struct nfs_server *server,
 		goto error;
 
 	server->port = data->nfs_server.port;
+	server->auth_info = data->auth_info;
 
 	error = nfs_init_server_rpcclient(server, &timeparms,
 					  data->selected_flavor);
@@ -929,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour
 	target->acdirmax = source->acdirmax;
 	target->caps = source->caps;
 	target->options = source->options;
+	target->auth_info = source->auth_info;
 }
 EXPORT_SYMBOL_GPL(nfs_server_copy_userdata);
 
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index d65090e5d00f..04131c837f27 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -962,6 +962,7 @@ static int nfs4_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->options = data->options;
+	server->auth_info = data->auth_info;
 
 	if (data->auth_info.flavor_len >= 1)
 		data->selected_flavor = data->auth_info.flavors[0];
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b87744fb9dcd..de1e5e89c93d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2187,8 +2187,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->wsize = nfss->wsize;
 	data->retrans = nfss->client->cl_timeout->to_retries;
 	data->selected_flavor = nfss->client->cl_auth->au_flavor;
-	data->auth_info.flavors[0] = nfss->client->cl_auth->au_flavor;
-	data->auth_info.flavor_len = 1;
+	data->auth_info = nfss->auth_info;
 	data->acregmin = nfss->acregmin / HZ;
 	data->acregmax = nfss->acregmax / HZ;
 	data->acdirmin = nfss->acdirmin / HZ;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 186ec4969313..1150ea41b626 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -151,6 +151,7 @@ struct nfs_server {
 	unsigned long		mount_time;	/* when this fs was mounted */
 	struct super_block	*super;		/* VFS super block */
 	dev_t			s_dev;		/* superblock dev numbers */
+	struct nfs_auth_info	auth_info;	/* parsed auth flavors */
 
 #ifdef CONFIG_NFS_FSCACHE
 	struct nfs_fscache_key	*fscache_key;	/* unique key for superblock */
-- 
cgit v1.3


From 5837f6dfcb00f764976ddc178933e612702cbf54 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:18 -0400
Subject: NFS: stop using NFS_MOUNT_SECFLAVOUR server flag

Since the parsed sec= flavor is now stored in nfs_server->auth_info,
we no longer need an nfs_server flag to determine if a sec= option was
used.

This flag has not been completely removed because it is still needed for
the (old but still supported) non-text parsed mount options ABI
compatability.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4client.c            | 6 ++++--
 fs/nfs/nfs4namespace.c         | 2 +-
 fs/nfs/nfs4proc.c              | 4 ++--
 fs/nfs/super.c                 | 3 +--
 include/uapi/linux/nfs_mount.h | 2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 04131c837f27..f6cc77c7d802 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1051,6 +1051,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
+	bool auth_probe;
 	int error;
 
 	dprintk("--> nfs4_create_referral_server()\n");
@@ -1083,8 +1084,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (error < 0)
 		goto error;
 
-	error = nfs4_server_common_setup(server, mntfh,
-			!(parent_server->flags & NFS_MOUNT_SECFLAVOUR));
+	auth_probe = parent_server->auth_info.flavor_len < 1;
+
+	error = nfs4_server_common_setup(server, mntfh, auth_probe);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 4ec0eea68fdf..b947054f0ca1 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -390,7 +390,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
 
 	if (client->cl_auth->au_flavor != flavor)
 		flavor = client->cl_auth->au_flavor;
-	else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) {
+	else if (server->auth_info.flavor_len == 0) {
 		rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
 		if ((int)new >= 0)
 			flavor = new;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1463c71b0862..0eb8dc5792da 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2920,7 +2920,7 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 		if (status != -NFS4ERR_WRONGSEC)
 			break;
 		/* Did user force a 'sec=' mount option? */
-		if (server->flags & NFS_MOUNT_SECFLAVOUR)
+		if (server->auth_info.flavor_len > 0)
 			break;
 	default:
 		status = nfs4_do_find_root_sec(server, fhandle, info);
@@ -3180,7 +3180,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			if (client != *clnt)
 				goto out;
 			/* No security negotiation if the user specified 'sec=' */
-			if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR)
+			if (NFS_SERVER(dir)->auth_info.flavor_len > 0)
 				goto out;
 			client = nfs4_create_sec_client(client, dir, name);
 			if (IS_ERR(client))
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index de1e5e89c93d..3a4f8bf5e5a5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1067,7 +1067,6 @@ static int nfs_parse_security_flavors(char *value,
 		return 0;
 	}
 
-	mnt->flags |= NFS_MOUNT_SECFLAVOUR;
 	mnt->auth_info.flavors[0] = pseudoflavor;
 	mnt->auth_info.flavor_len = 1;
 	return 1;
@@ -2332,7 +2331,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
 		goto Ebusy;
 	if (a->acdirmax != b->acdirmax)
 		goto Ebusy;
-	if (b->flags & NFS_MOUNT_SECFLAVOUR &&
+	if (b->auth_info.flavor_len > 0 &&
 	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
 	return 1;
diff --git a/include/uapi/linux/nfs_mount.h b/include/uapi/linux/nfs_mount.h
index 576bddd72e04..64b0f22f5c4c 100644
--- a/include/uapi/linux/nfs_mount.h
+++ b/include/uapi/linux/nfs_mount.h
@@ -60,7 +60,7 @@ struct nfs_mount_data {
 #define NFS_MOUNT_BROKEN_SUID	0x0400	/* 4 */
 #define NFS_MOUNT_NOACL		0x0800	/* 4 */
 #define NFS_MOUNT_STRICTLOCK	0x1000	/* reserved for NFSv4 */
-#define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 */
+#define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 non-text parsed mount data only */
 #define NFS_MOUNT_NORDIRPLUS	0x4000	/* 5 */
 #define NFS_MOUNT_UNSHARED	0x8000	/* 5 */
 #define NFS_MOUNT_FLAGMASK	0xFFFF
-- 
cgit v1.3


From 4d4b69dd847a098cdca341c45326f6c6f61b8691 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@netapp.com>
Date: Fri, 18 Oct 2013 15:15:19 -0400
Subject: NFS: add support for multiple sec= mount options

This patch adds support for multiple security options which can be
specified using a colon-delimited list of security flavors (the same
syntax as nfsd's exports file).

This is useful, for instance, when NFSv4.x mounts cross SECINFO
boundaries. With this patch a user can use "sec=krb5i,krb5p"
to mount a remote filesystem using krb5i, but can still cross
into krb5p-only exports.

New mounts will try all security options before failing.  NFSv4.x
SECINFO results will be compared against the sec= flavors to
find the first flavor in both lists or if no match is found will
return -EPERM.

Signed-off-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h       |   1 +
 fs/nfs/nfs4client.c     |   3 +
 fs/nfs/nfs4namespace.c  |  17 +++--
 fs/nfs/nfs4proc.c       |  32 ++++++----
 fs/nfs/super.c          | 160 +++++++++++++++++++++++++++++++-----------------
 include/linux/nfs_xdr.h |   3 +-
 6 files changed, 145 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c8cd044f0982..bca6a3e3c49c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -326,6 +326,7 @@ extern struct file_system_type nfs_xdev_fs_type;
 extern struct file_system_type nfs4_xdev_fs_type;
 extern struct file_system_type nfs4_referral_fs_type;
 #endif
+bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
 struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *,
 			struct nfs_subversion *);
 void nfs_initialise_sb(struct super_block *);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index f6cc77c7d802..b4a160a405ce 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -964,6 +964,9 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->options = data->options;
 	server->auth_info = data->auth_info;
 
+	/* Use the first specified auth flavor. If this flavor isn't
+	 * allowed by the server, use the SECINFO path to try the
+	 * other specified flavors */
 	if (data->auth_info.flavor_len >= 1)
 		data->selected_flavor = data->auth_info.flavors[0];
 	else
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index b947054f0ca1..c08cbf40c59e 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
 
 /**
  * nfs_find_best_sec - Find a security mechanism supported locally
+ * @server: NFS server struct
  * @flavors: List of security tuples returned by SECINFO procedure
  *
  * Return the pseudoflavor of the first security mechanism in
@@ -145,7 +146,8 @@ static size_t nfs_parse_server_name(char *string, size_t len,
  * is searched in the order returned from the server, per RFC 3530
  * recommendation.
  */
-static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server,
+					  struct nfs4_secinfo_flavors *flavors)
 {
 	rpc_authflavor_t pseudoflavor;
 	struct nfs4_secinfo4 *secinfo;
@@ -160,12 +162,19 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 		case RPC_AUTH_GSS:
 			pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
 							&secinfo->flavor_info);
-			if (pseudoflavor != RPC_AUTH_MAXFLAVOR)
+			/* make sure pseudoflavor matches sec= mount opt */
+			if (pseudoflavor != RPC_AUTH_MAXFLAVOR &&
+			    nfs_auth_info_match(&server->auth_info,
+						pseudoflavor))
 				return pseudoflavor;
 			break;
 		}
 	}
 
+	/* if there were any sec= options then nothing matched */
+	if (server->auth_info.flavor_len > 0)
+		return -EPERM;
+
 	return RPC_AUTH_UNIX;
 }
 
@@ -187,7 +196,7 @@ static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr
 		goto out;
 	}
 
-	flavor = nfs_find_best_sec(flavors);
+	flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors);
 
 out:
 	put_page(page);
@@ -390,7 +399,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
 
 	if (client->cl_auth->au_flavor != flavor)
 		flavor = client->cl_auth->au_flavor;
-	else if (server->auth_info.flavor_len == 0) {
+	else {
 		rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
 		if ((int)new >= 0)
 			flavor = new;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0eb8dc5792da..b02c4cc7b0a9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2873,11 +2873,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	int status = -EPERM;
 	size_t i;
 
-	for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
-		status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
-		if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
-			continue;
-		break;
+	if (server->auth_info.flavor_len > 0) {
+		/* try each flavor specified by user */
+		for (i = 0; i < server->auth_info.flavor_len; i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						server->auth_info.flavors[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
+	} else {
+		/* no flavors specified by user, try default list */
+		for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						      flav_array[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
 	}
 
 	/*
@@ -2919,9 +2932,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 		status = nfs4_lookup_root(server, fhandle, info);
 		if (status != -NFS4ERR_WRONGSEC)
 			break;
-		/* Did user force a 'sec=' mount option? */
-		if (server->auth_info.flavor_len > 0)
-			break;
 	default:
 		status = nfs4_do_find_root_sec(server, fhandle, info);
 	}
@@ -3179,9 +3189,6 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			err = -EPERM;
 			if (client != *clnt)
 				goto out;
-			/* No security negotiation if the user specified 'sec=' */
-			if (NFS_SERVER(dir)->auth_info.flavor_len > 0)
-				goto out;
 			client = nfs4_create_sec_client(client, dir, name);
 			if (IS_ERR(client))
 				return PTR_ERR(client);
@@ -7942,6 +7949,9 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 			break;
 		}
 
+		if (!nfs_auth_info_match(&server->auth_info, flavor))
+			flavor = RPC_AUTH_MAXFLAVOR;
+
 		if (flavor != RPC_AUTH_MAXFLAVOR) {
 			err = nfs4_lookup_root_sec(server, fhandle,
 						   info, flavor);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3a4f8bf5e5a5..317d6fc2160e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
 	static const struct {
 		rpc_authflavor_t flavour;
 		const char *str;
-	} sec_flavours[] = {
+	} sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = {
+		/* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */
 		{ RPC_AUTH_NULL, "null" },
 		{ RPC_AUTH_UNIX, "sys" },
 		{ RPC_AUTH_GSS_KRB5, "krb5" },
@@ -1018,6 +1019,52 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
 	}
 }
 
+/*
+ * Add 'flavor' to 'auth_info' if not already present.
+ * Returns true if 'flavor' ends up in the list, false otherwise
+ */
+static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
+			      rpc_authflavor_t flavor)
+{
+	unsigned int i;
+	unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
+				       sizeof(auth_info->flavors[0]));
+
+	/* make sure this flavor isn't already in the list */
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (flavor == auth_info->flavors[i])
+			return true;
+	}
+
+	if (auth_info->flavor_len + 1 >= max_flavor_len) {
+		dfprintk(MOUNT, "NFS: too many sec= flavors\n");
+		return false;
+	}
+
+	auth_info->flavors[auth_info->flavor_len++] = flavor;
+	return true;
+}
+
+/*
+ * Return true if 'match' is in auth_info or auth_info is empty.
+ * Return false otherwise.
+ */
+bool nfs_auth_info_match(const struct nfs_auth_info *auth_info,
+			 rpc_authflavor_t match)
+{
+	int i;
+
+	if (!auth_info->flavor_len)
+		return true;
+
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (auth_info->flavors[i] == match)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs_auth_info_match);
+
 /*
  * Parse the value of the 'sec=' option.
  */
@@ -1026,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value,
 {
 	substring_t args[MAX_OPT_ARGS];
 	rpc_authflavor_t pseudoflavor;
+	char *p;
 
 	dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
 
-	switch (match_token(value, nfs_secflavor_tokens, args)) {
-	case Opt_sec_none:
-		pseudoflavor = RPC_AUTH_NULL;
-		break;
-	case Opt_sec_sys:
-		pseudoflavor = RPC_AUTH_UNIX;
-		break;
-	case Opt_sec_krb5:
-		pseudoflavor = RPC_AUTH_GSS_KRB5;
-		break;
-	case Opt_sec_krb5i:
-		pseudoflavor = RPC_AUTH_GSS_KRB5I;
-		break;
-	case Opt_sec_krb5p:
-		pseudoflavor = RPC_AUTH_GSS_KRB5P;
-		break;
-	case Opt_sec_lkey:
-		pseudoflavor = RPC_AUTH_GSS_LKEY;
-		break;
-	case Opt_sec_lkeyi:
-		pseudoflavor = RPC_AUTH_GSS_LKEYI;
-		break;
-	case Opt_sec_lkeyp:
-		pseudoflavor = RPC_AUTH_GSS_LKEYP;
-		break;
-	case Opt_sec_spkm:
-		pseudoflavor = RPC_AUTH_GSS_SPKM;
-		break;
-	case Opt_sec_spkmi:
-		pseudoflavor = RPC_AUTH_GSS_SPKMI;
-		break;
-	case Opt_sec_spkmp:
-		pseudoflavor = RPC_AUTH_GSS_SPKMP;
-		break;
-	default:
-		return 0;
+	while ((p = strsep(&value, ":")) != NULL) {
+		switch (match_token(p, nfs_secflavor_tokens, args)) {
+		case Opt_sec_none:
+			pseudoflavor = RPC_AUTH_NULL;
+			break;
+		case Opt_sec_sys:
+			pseudoflavor = RPC_AUTH_UNIX;
+			break;
+		case Opt_sec_krb5:
+			pseudoflavor = RPC_AUTH_GSS_KRB5;
+			break;
+		case Opt_sec_krb5i:
+			pseudoflavor = RPC_AUTH_GSS_KRB5I;
+			break;
+		case Opt_sec_krb5p:
+			pseudoflavor = RPC_AUTH_GSS_KRB5P;
+			break;
+		case Opt_sec_lkey:
+			pseudoflavor = RPC_AUTH_GSS_LKEY;
+			break;
+		case Opt_sec_lkeyi:
+			pseudoflavor = RPC_AUTH_GSS_LKEYI;
+			break;
+		case Opt_sec_lkeyp:
+			pseudoflavor = RPC_AUTH_GSS_LKEYP;
+			break;
+		case Opt_sec_spkm:
+			pseudoflavor = RPC_AUTH_GSS_SPKM;
+			break;
+		case Opt_sec_spkmi:
+			pseudoflavor = RPC_AUTH_GSS_SPKMI;
+			break;
+		case Opt_sec_spkmp:
+			pseudoflavor = RPC_AUTH_GSS_SPKMP;
+			break;
+		default:
+			dfprintk(MOUNT,
+				 "NFS: sec= option '%s' not recognized\n", p);
+			return 0;
+		}
+
+		if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor))
+			return 0;
 	}
 
-	mnt->auth_info.flavors[0] = pseudoflavor;
-	mnt->auth_info.flavor_len = 1;
 	return 1;
 }
 
@@ -1615,12 +1668,14 @@ out_security_failure:
 }
 
 /*
- * Ensure that the specified authtype in args->auth_info is supported by
- * the server. Returns 0 if it's ok, and -EACCES if not.
+ * Ensure that a specified authtype in args->auth_info is supported by
+ * the server. Returns 0 and sets args->selected_flavor if it's ok, and
+ * -EACCES if not.
  */
-static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 			rpc_authflavor_t *server_authlist, unsigned int count)
 {
+	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
 	unsigned int i;
 
 	/*
@@ -1632,17 +1687,19 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
 	 * can be used.
 	 */
 	for (i = 0; i < count; i++) {
-		if (args->auth_info.flavors[0] == server_authlist[i] ||
-		    server_authlist[i] == RPC_AUTH_NULL)
+		flavor = server_authlist[i];
+
+		if (nfs_auth_info_match(&args->auth_info, flavor) ||
+		    flavor == RPC_AUTH_NULL)
 			goto out;
 	}
 
-	dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
-		args->auth_info.flavors[0]);
+	dfprintk(MOUNT,
+		 "NFS: specified auth flavors not supported by server\n");
 	return -EACCES;
 
 out:
-	args->selected_flavor = args->auth_info.flavors[0];
+	args->selected_flavor = flavor;
 	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);
 	return 0;
 }
@@ -1732,7 +1789,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 	 * whether the server supports it, and then just try to use it if so.
 	 */
 	if (args->auth_info.flavor_len > 0) {
-		status = nfs_verify_authflavor(args, authlist, authlist_len);
+		status = nfs_verify_authflavors(args, authlist, authlist_len);
 		dfprintk(MOUNT, "NFS: using auth flavor %u\n",
 			 args->selected_flavor);
 		if (status)
@@ -2102,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options,
 
 	nfs_set_port(sap, &args->nfs_server.port, port);
 
-	if (args->auth_info.flavor_len > 1)
-		goto out_bad_auth;
-
 	return nfs_parse_devname(dev_name,
 				   &args->nfs_server.hostname,
 				   max_namelen,
@@ -2124,10 +2178,6 @@ out_invalid_transport_udp:
 out_no_address:
 	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
 	return -EINVAL;
-
-out_bad_auth:
-	dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n");
-	return -EINVAL;
 }
 
 static int
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 658104acf13b..3ccfcecf8999 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -592,9 +592,10 @@ struct nfs_renameres {
 };
 
 /* parsed sec= options */
+#define NFS_AUTH_INFO_MAX_FLAVORS 12 /* see fs/nfs/super.c */
 struct nfs_auth_info {
 	unsigned int            flavor_len;
-	rpc_authflavor_t        flavors[1];
+	rpc_authflavor_t        flavors[NFS_AUTH_INFO_MAX_FLAVORS];
 };
 
 /*
-- 
cgit v1.3


From 9857df815f17417be116f800636f460e247afb5a Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@primarydata.com>
Date: Mon, 14 Oct 2013 13:44:52 +0300
Subject: nfsd: nfs4_free_stid

Make it symmetric to nfs4_alloc_stid.

Signed-off-by: Benny Halevy <bhalevy@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a601fd49f997..9cb1b17014f5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -402,11 +402,16 @@ static void remove_stid(struct nfs4_stid *s)
 	idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
 }
 
+static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s)
+{
+	kmem_cache_free(slab, s);
+}
+
 void
 nfs4_put_delegation(struct nfs4_delegation *dp)
 {
 	if (atomic_dec_and_test(&dp->dl_count)) {
-		kmem_cache_free(deleg_slab, dp);
+		nfs4_free_stid(deleg_slab, &dp->dl_stid);
 		num_delegations--;
 	}
 }
@@ -610,7 +615,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)
 static void free_generic_stateid(struct nfs4_ol_stateid *stp)
 {
 	remove_stid(&stp->st_stid);
-	kmem_cache_free(stateid_slab, stp);
+	nfs4_free_stid(stateid_slab, &stp->st_stid);
 }
 
 static void release_lock_stateid(struct nfs4_ol_stateid *stp)
-- 
cgit v1.3


From 5d7dab83e3e55e81a88c406da71afde0ebaa6ad8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@primarydata.com>
Date: Tue, 15 Oct 2013 09:57:55 +0300
Subject: nfsd: nfs4_open_delegation needs to remove_stid rather than
 unhash_stid

In the out_free: path, the newly allocated stid must be removed rather
than unhashed so it can never be found.

Signed-off-by: Benny Halevy <bhalevy@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9cb1b17014f5..3feb38e818ab 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3159,7 +3159,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
 	open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
 	return;
 out_free:
-	unhash_stid(&dp->dl_stid);
+	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 out_no_deleg:
 	open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
-- 
cgit v1.3


From 7ebe40f20372688a627ad6c754bc0d1c05df58a9 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@primarydata.com>
Date: Tue, 15 Oct 2013 09:58:00 +0300
Subject: nfsd: remove_stid can be incorporated into nfs4_put_delegation

All calls to nfs4_put_delegation are preceded with remove_stid.

Signed-off-by: Benny Halevy <bhalevy@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3feb38e818ab..77dfc5a6a011 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -410,6 +410,7 @@ static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s)
 void
 nfs4_put_delegation(struct nfs4_delegation *dp)
 {
+	remove_stid(&dp->dl_stid);
 	if (atomic_dec_and_test(&dp->dl_count)) {
 		nfs4_free_stid(deleg_slab, &dp->dl_stid);
 		num_delegations--;
@@ -450,14 +451,12 @@ unhash_delegation(struct nfs4_delegation *dp)
 static void destroy_revoked_delegation(struct nfs4_delegation *dp)
 {
 	list_del_init(&dp->dl_recall_lru);
-	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 }
 
 static void destroy_delegation(struct nfs4_delegation *dp)
 {
 	unhash_delegation(dp);
-	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 }
 
@@ -3159,7 +3158,6 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
 	open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
 	return;
 out_free:
-	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 out_no_deleg:
 	open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
-- 
cgit v1.3


From 4f5829d72636513e23a5f76355536776bf390a8a Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Fri, 11 Oct 2013 17:15:54 -0300
Subject: nfs: Remove useless 'error' assignment

the 'error' variable was been assigned twice in vain.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/unlink.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index bb939edd4c99..0c29b1bb3936 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -493,7 +493,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	unsigned long long fileid;
 	struct dentry *sdentry;
 	struct rpc_task *task;
-	int            error = -EIO;
+	int            error = -EBUSY;
 
 	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -503,7 +503,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	/*
 	 * We don't allow a dentry to be silly-renamed twice.
 	 */
-	error = -EBUSY;
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
 		goto out;
 
-- 
cgit v1.3


From 54bcfa6682cfc7ee19dee0e94587f8f55a440df6 Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Mon, 14 Oct 2013 17:24:15 -0300
Subject: nfs: Use PTR_ERR_OR_ZERO in 'nfs41_callback_up' function

Use 'PTR_ERR_OR_ZERO()' rather than 'IS_ERR(...) ? PTR_ERR(...) : 0'.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 67cd73213168..073b4cf67ed9 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv)
 		svc_xprt_put(serv->sv_bc_xprt);
 		serv->sv_bc_xprt = NULL;
 	}
-	dprintk("--> %s return %ld\n", __func__,
-		IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
+	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
 	return rqstp;
 }
 
-- 
cgit v1.3


From 6706246b22358b70b49e35a988a9bbfa830619ee Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Mon, 14 Oct 2013 17:24:16 -0300
Subject: nfs: Use PTR_ERR_OR_ZERO in 'nfs/nfs4super.c'

Use 'PTR_ERR_OR_ZERO()' rather than 'IS_ERR(...) ? PTR_ERR(...) : 0'.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4super.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index e26acdd1a645..65ab0a0ca1c4 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -261,9 +261,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 
 	res = nfs_follow_remote_path(root_mnt, export_path);
 
-	dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
-			IS_ERR(res) ? PTR_ERR(res) : 0,
-			IS_ERR(res) ? " [error]" : "");
+	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n",
+		 PTR_ERR_OR_ZERO(res),
+		 IS_ERR(res) ? " [error]" : "");
 	return res;
 }
 
@@ -319,9 +319,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
 	data->mnt_path = export_path;
 
 	res = nfs_follow_remote_path(root_mnt, export_path);
-	dprintk("<-- nfs4_referral_mount() = %ld%s\n",
-			IS_ERR(res) ? PTR_ERR(res) : 0,
-			IS_ERR(res) ? " [error]" : "");
+	dprintk("<-- nfs4_referral_mount() = %d%s\n",
+		PTR_ERR_OR_ZERO(res),
+		IS_ERR(res) ? " [error]" : "");
 	return res;
 }
 
-- 
cgit v1.3


From a3f432bfd06a4ec3b812e32d3266e0d1ad75d008 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 15 Oct 2013 17:03:16 -0400
Subject: nfs: use IS_ROOT not DCACHE_DISCONNECTED

This check was added by Al Viro with
d9e80b7de91db05c1c4d2e5ebbfd70b3b3ba0e0f "nfs d_revalidate() is too
trigger-happy with d_drop()", with the explanation that we don't want to
remove the root of a disconnected tree, which will still be included on
the s_anon list.

But DCACHE_DISCONNECTED does *not* actually identify dentries that are
disconnected from the dentry tree or hashed on s_anon.  IS_ROOT() is the
way to do that.

Also add a comment from Al's commit to remind us why this check is
there.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 02b0df769e2d..6cc51ae0638d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1139,7 +1139,13 @@ out_zap_parent:
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
 		nfs_zap_caches(inode);
-		if (dentry->d_flags & DCACHE_DISCONNECTED)
+		/*
+		 * We can't d_drop the root of a disconnected tree:
+		 * its d_hash is on the s_anon list and d_drop() would hide
+		 * it from shrink_dcache_for_unmount(), leading to busy
+		 * inodes on unmount and further oopses.
+		 */
+		if (IS_ROOT(dentry))
 			goto out_valid;
 	}
 	/* If we have submounts, don't unhash ! */
-- 
cgit v1.3


From 3b218e3a21ccec183472015f1d7168400b187a58 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Tue, 29 Oct 2013 15:43:01 +0900
Subject: f2fs: introduce CONFIG_F2FS_CHECK_FS for BUG_ON control

This config will support an option to remove so many BUG_ONs that degrade
the performance potentially.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/Kconfig | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index e06e0995e00f..214fe1054fce 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -63,3 +63,11 @@ config F2FS_FS_SECURITY
 	  the extended attribute support in advance.
 
 	  If you are not using a security module, say N.
+
+config F2FS_CHECK_FS
+	bool "F2FS consistency checking feature"
+	depends on F2FS_FS
+	help
+	  Enables BUG_ONs which check the file system consistency in runtime.
+
+	  If you want to improve the performance, say N.
-- 
cgit v1.3


From 5d56b6718a0f4e5c58cdd3cb6b7a472d7c5671b9 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Tue, 29 Oct 2013 15:14:54 +0900
Subject: f2fs: add an option to avoid unnecessary BUG_ONs

If you want to remove unnecessary BUG_ONs, you can just turn off F2FS_CHECK_FS
in your kernel config.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c | 10 +++++-----
 fs/f2fs/data.c       |  4 ++--
 fs/f2fs/dir.c        |  2 +-
 fs/f2fs/f2fs.h       | 22 ++++++++++++++--------
 fs/f2fs/file.c       |  2 +-
 fs/f2fs/inode.c      |  2 +-
 fs/f2fs/node.c       | 42 +++++++++++++++++++++---------------------
 fs/f2fs/recovery.c   | 10 +++++-----
 fs/f2fs/segment.c    | 16 ++++++++--------
 fs/f2fs/segment.h    | 10 ++++++----
 fs/f2fs/xattr.c      |  2 +-
 11 files changed, 65 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6526f5010621..d430157ffe60 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -142,8 +142,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 			lock_page(page);
-			BUG_ON(page->mapping != mapping);
-			BUG_ON(!PageDirty(page));
+			f2fs_bug_on(page->mapping != mapping);
+			f2fs_bug_on(!PageDirty(page));
 			clear_page_dirty_for_io(page);
 			if (f2fs_write_meta_page(page, &wbc)) {
 				unlock_page(page);
@@ -208,7 +208,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 void release_orphan_inode(struct f2fs_sb_info *sbi)
 {
 	mutex_lock(&sbi->orphan_inode_mutex);
-	BUG_ON(sbi->n_orphans == 0);
+	f2fs_bug_on(sbi->n_orphans == 0);
 	sbi->n_orphans--;
 	mutex_unlock(&sbi->orphan_inode_mutex);
 }
@@ -252,7 +252,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 		if (orphan->ino == ino) {
 			list_del(&orphan->list);
 			kmem_cache_free(orphan_entry_slab, orphan);
-			BUG_ON(sbi->n_orphans == 0);
+			f2fs_bug_on(sbi->n_orphans == 0);
 			sbi->n_orphans--;
 			break;
 		}
@@ -263,7 +263,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
 	struct inode *inode = f2fs_iget(sbi->sb, ino);
-	BUG_ON(IS_ERR(inode));
+	f2fs_bug_on(IS_ERR(inode));
 	clear_nlink(inode);
 
 	/* truncate all the data during iput */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c8887d847dcf..aa3438c571fa 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -110,7 +110,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 	pgoff_t fofs, start_fofs, end_fofs;
 	block_t start_blkaddr, end_blkaddr;
 
-	BUG_ON(blk_addr == NEW_ADDR);
+	f2fs_bug_on(blk_addr == NEW_ADDR);
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
 							dn->ofs_in_node;
 
@@ -436,7 +436,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
 	}
 
 	/* It does not support data allocation */
-	BUG_ON(create);
+	f2fs_bug_on(create);
 
 	if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
 		int i;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index c9d53fc510c5..594fc1bb64ef 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -139,7 +139,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 	bool room = false;
 	int max_slots = 0;
 
-	BUG_ON(level > MAX_DIR_HASH_DEPTH);
+	f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
 
 	nbucket = dir_buckets(level);
 	nblock = bucket_blocks(level);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a61cc5f84638..6aaefdbaaac0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -20,6 +20,12 @@
 #include <linux/kobject.h>
 #include <linux/sched.h>
 
+#ifdef CONFIG_F2FS_CHECK_FS
+#define f2fs_bug_on(condition)	BUG_ON(condition)
+#else
+#define f2fs_bug_on(condition)
+#endif
+
 /*
  * For mount options
  */
@@ -584,8 +590,8 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
 						blkcnt_t count)
 {
 	spin_lock(&sbi->stat_lock);
-	BUG_ON(sbi->total_valid_block_count < (block_t) count);
-	BUG_ON(inode->i_blocks < count);
+	f2fs_bug_on(sbi->total_valid_block_count < (block_t) count);
+	f2fs_bug_on(inode->i_blocks < count);
 	inode->i_blocks -= count;
 	sbi->total_valid_block_count -= (block_t)count;
 	spin_unlock(&sbi->stat_lock);
@@ -717,9 +723,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
 {
 	spin_lock(&sbi->stat_lock);
 
-	BUG_ON(sbi->total_valid_block_count < count);
-	BUG_ON(sbi->total_valid_node_count < count);
-	BUG_ON(inode->i_blocks < count);
+	f2fs_bug_on(sbi->total_valid_block_count < count);
+	f2fs_bug_on(sbi->total_valid_node_count < count);
+	f2fs_bug_on(inode->i_blocks < count);
 
 	inode->i_blocks -= count;
 	sbi->total_valid_node_count -= count;
@@ -740,7 +746,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
 static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->stat_lock);
-	BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
+	f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count);
 	sbi->total_valid_inode_count++;
 	spin_unlock(&sbi->stat_lock);
 }
@@ -748,7 +754,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
 static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->stat_lock);
-	BUG_ON(!sbi->total_valid_inode_count);
+	f2fs_bug_on(!sbi->total_valid_inode_count);
 	sbi->total_valid_inode_count--;
 	spin_unlock(&sbi->stat_lock);
 	return 0;
@@ -769,7 +775,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
 		return;
 
 	if (unlock) {
-		BUG_ON(!PageLocked(page));
+		f2fs_bug_on(!PageLocked(page));
 		unlock_page(page);
 	}
 	page_cache_release(page);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 2d4190a4abe3..58ed19aaa4c6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -296,7 +296,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
 		count = ADDRS_PER_BLOCK;
 
 	count -= dn.ofs_in_node;
-	BUG_ON(count < 0);
+	f2fs_bug_on(count < 0);
 
 	if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
 		truncate_data_blocks_range(&dn, count);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 7377ca3ce5c5..d0eaa9faeca0 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -266,7 +266,7 @@ void f2fs_evict_inode(struct inode *inode)
 			inode->i_ino == F2FS_META_INO(sbi))
 		goto no_delete;
 
-	BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents));
+	f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents));
 	remove_dirty_dir_inode(inode);
 
 	if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 304d5ced6355..8e331d5b9fcd 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -204,7 +204,7 @@ retry:
 		}
 		e->ni = *ni;
 		e->checkpointed = true;
-		BUG_ON(ni->blk_addr == NEW_ADDR);
+		f2fs_bug_on(ni->blk_addr == NEW_ADDR);
 	} else if (new_blkaddr == NEW_ADDR) {
 		/*
 		 * when nid is reallocated,
@@ -212,19 +212,19 @@ retry:
 		 * So, reinitialize it with new information.
 		 */
 		e->ni = *ni;
-		BUG_ON(ni->blk_addr != NULL_ADDR);
+		f2fs_bug_on(ni->blk_addr != NULL_ADDR);
 	}
 
 	if (new_blkaddr == NEW_ADDR)
 		e->checkpointed = false;
 
 	/* sanity check */
-	BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
-	BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
+	f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
+	f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
 			new_blkaddr == NULL_ADDR);
-	BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
+	f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
 			new_blkaddr == NEW_ADDR);
-	BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
+	f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
 			nat_get_blkaddr(e) != NULL_ADDR &&
 			new_blkaddr == NEW_ADDR);
 
@@ -495,10 +495,10 @@ static void truncate_node(struct dnode_of_data *dn)
 
 	get_node_info(sbi, dn->nid, &ni);
 	if (dn->inode->i_blocks == 0) {
-		BUG_ON(ni.blk_addr != NULL_ADDR);
+		f2fs_bug_on(ni.blk_addr != NULL_ADDR);
 		goto invalidate;
 	}
-	BUG_ON(ni.blk_addr == NULL_ADDR);
+	f2fs_bug_on(ni.blk_addr == NULL_ADDR);
 
 	/* Deallocate node address */
 	invalidate_blocks(sbi, ni.blk_addr);
@@ -822,7 +822,7 @@ int remove_inode_page(struct inode *inode)
 	}
 
 	/* 0 is possible, after f2fs_new_inode() is failed */
-	BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1);
+	f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
 	set_new_dnode(&dn, inode, page, page, ino);
 	truncate_node(&dn);
 	return 0;
@@ -863,7 +863,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
 	get_node_info(sbi, dn->nid, &old_ni);
 
 	/* Reinitialize old_ni with new node page */
-	BUG_ON(old_ni.blk_addr != NULL_ADDR);
+	f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
 	new_ni = old_ni;
 	new_ni.ino = dn->inode->i_ino;
 	set_node_addr(sbi, &new_ni, NEW_ADDR);
@@ -969,7 +969,7 @@ repeat:
 		goto repeat;
 	}
 got_it:
-	BUG_ON(nid != nid_of_node(page));
+	f2fs_bug_on(nid != nid_of_node(page));
 	mark_page_accessed(page);
 	return page;
 }
@@ -1163,7 +1163,7 @@ static int f2fs_write_node_page(struct page *page,
 
 	/* get old block addr of this node page */
 	nid = nid_of_node(page);
-	BUG_ON(page->index != nid);
+	f2fs_bug_on(page->index != nid);
 
 	get_node_info(sbi, nid, &ni);
 
@@ -1349,7 +1349,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i,
 			break;
 
 		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
-		BUG_ON(blk_addr == NEW_ADDR);
+		f2fs_bug_on(blk_addr == NEW_ADDR);
 		if (blk_addr == NULL_ADDR) {
 			if (add_free_nid(nm_i, start_nid, true) < 0)
 				break;
@@ -1420,14 +1420,14 @@ retry:
 
 	/* We should not use stale free nids created by build_free_nids */
 	if (nm_i->fcnt && !sbi->on_build_free_nids) {
-		BUG_ON(list_empty(&nm_i->free_nid_list));
+		f2fs_bug_on(list_empty(&nm_i->free_nid_list));
 		list_for_each(this, &nm_i->free_nid_list) {
 			i = list_entry(this, struct free_nid, list);
 			if (i->state == NID_NEW)
 				break;
 		}
 
-		BUG_ON(i->state != NID_NEW);
+		f2fs_bug_on(i->state != NID_NEW);
 		*nid = i->nid;
 		i->state = NID_ALLOC;
 		nm_i->fcnt--;
@@ -1455,7 +1455,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
 
 	spin_lock(&nm_i->free_nid_list_lock);
 	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
-	BUG_ON(!i || i->state != NID_ALLOC);
+	f2fs_bug_on(!i || i->state != NID_ALLOC);
 	__del_from_free_nid_list(i);
 	spin_unlock(&nm_i->free_nid_list_lock);
 }
@@ -1473,7 +1473,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 
 	spin_lock(&nm_i->free_nid_list_lock);
 	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
-	BUG_ON(!i || i->state != NID_ALLOC);
+	f2fs_bug_on(!i || i->state != NID_ALLOC);
 	if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
 		__del_from_free_nid_list(i);
 	} else {
@@ -1676,7 +1676,7 @@ to_nat_page:
 			nat_blk = page_address(page);
 		}
 
-		BUG_ON(!nat_blk);
+		f2fs_bug_on(!nat_blk);
 		raw_ne = nat_blk->entries[nid - start_nid];
 flush_now:
 		new_blkaddr = nat_get_blkaddr(ne);
@@ -1780,11 +1780,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	/* destroy free nid list */
 	spin_lock(&nm_i->free_nid_list_lock);
 	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
-		BUG_ON(i->state == NID_ALLOC);
+		f2fs_bug_on(i->state == NID_ALLOC);
 		__del_from_free_nid_list(i);
 		nm_i->fcnt--;
 	}
-	BUG_ON(nm_i->fcnt);
+	f2fs_bug_on(nm_i->fcnt);
 	spin_unlock(&nm_i->free_nid_list_lock);
 
 	/* destroy nat cache */
@@ -1798,7 +1798,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 			__del_from_nat_cache(nm_i, e);
 		}
 	}
-	BUG_ON(nm_i->nat_cnt);
+	f2fs_bug_on(nm_i->nat_cnt);
 	write_unlock(&nm_i->nat_tree_lock);
 
 	kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index b278c68b3e08..fdc81161f254 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -311,8 +311,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	wait_on_page_writeback(dn.node_page);
 
 	get_node_info(sbi, dn.nid, &ni);
-	BUG_ON(ni.ino != ino_of_node(page));
-	BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page));
+	f2fs_bug_on(ni.ino != ino_of_node(page));
+	f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page));
 
 	for (; start < end; start++) {
 		block_t src, dest;
@@ -322,9 +322,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
 		if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
 			if (src == NULL_ADDR) {
-				int err = reserve_new_block(&dn);
+				err = reserve_new_block(&dn);
 				/* We should not get -ENOSPC */
-				BUG_ON(err);
+				f2fs_bug_on(err);
 			}
 
 			/* Check the previous node page having this index */
@@ -447,7 +447,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 
 	/* step #2: recover data */
 	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
-	BUG_ON(!list_empty(&inode_list));
+	f2fs_bug_on(!list_empty(&inode_list));
 out:
 	destroy_fsync_dnodes(&inode_list);
 	kmem_cache_destroy(fsync_entry_slab);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 8f92c18a01fb..c7161defff90 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -192,7 +192,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 	new_vblocks = se->valid_blocks + del;
 	offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
 
-	BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) ||
+	f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
 				(new_vblocks > sbi->blocks_per_seg)));
 
 	se->valid_blocks = new_vblocks;
@@ -232,7 +232,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 	unsigned int segno = GET_SEGNO(sbi, addr);
 	struct sit_info *sit_i = SIT_I(sbi);
 
-	BUG_ON(addr == NULL_ADDR);
+	f2fs_bug_on(addr == NULL_ADDR);
 	if (addr == NEW_ADDR)
 		return;
 
@@ -347,7 +347,7 @@ find_other_zone:
 		if (dir == ALLOC_RIGHT) {
 			secno = find_next_zero_bit(free_i->free_secmap,
 							TOTAL_SECS(sbi), 0);
-			BUG_ON(secno >= TOTAL_SECS(sbi));
+			f2fs_bug_on(secno >= TOTAL_SECS(sbi));
 		} else {
 			go_left = 1;
 			left_start = hint - 1;
@@ -363,7 +363,7 @@ find_other_zone:
 		}
 		left_start = find_next_zero_bit(free_i->free_secmap,
 							TOTAL_SECS(sbi), 0);
-		BUG_ON(left_start >= TOTAL_SECS(sbi));
+		f2fs_bug_on(left_start >= TOTAL_SECS(sbi));
 		break;
 	}
 	secno = left_start;
@@ -402,7 +402,7 @@ skip_left:
 	}
 got_it:
 	/* set it as dirty segment in free segmap */
-	BUG_ON(test_bit(segno, free_i->free_segmap));
+	f2fs_bug_on(test_bit(segno, free_i->free_segmap));
 	__set_inuse(sbi, segno);
 	*newseg = segno;
 	write_unlock(&free_i->segmap_lock);
@@ -773,7 +773,7 @@ static int __get_segment_type(struct page *page, enum page_type p_type)
 		return __get_segment_type_4(page, p_type);
 	}
 	/* NR_CURSEG_TYPE(6) logs by default */
-	BUG_ON(sbi->active_logs != NR_CURSEG_TYPE);
+	f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE);
 	return __get_segment_type_6(page, p_type);
 }
 
@@ -850,7 +850,7 @@ void write_data_page(struct inode *inode, struct page *page,
 	struct f2fs_summary sum;
 	struct node_info ni;
 
-	BUG_ON(old_blkaddr == NULL_ADDR);
+	f2fs_bug_on(old_blkaddr == NULL_ADDR);
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
@@ -1240,7 +1240,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
 	/* get current sit block page without lock */
 	src_page = get_meta_page(sbi, src_off);
 	dst_page = grab_meta_page(sbi, dst_off);
-	BUG_ON(PageDirty(src_page));
+	f2fs_bug_on(PageDirty(src_page));
 
 	src_addr = page_address(src_page);
 	dst_addr = page_address(dst_page);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index abe7094c4f7a..001d4c4d97fd 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -522,16 +522,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
 	return curseg->next_blkoff;
 }
 
+#ifdef CONFIG_F2FS_CHECK_FS
 static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
 {
 	unsigned int end_segno = SM_I(sbi)->segment_count - 1;
 	BUG_ON(segno > end_segno);
 }
 
-/*
- * This function is used for only debugging.
- * NOTE: In future, we have to remove this function.
- */
 static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 {
 	struct f2fs_sm_info *sm_info = SM_I(sbi);
@@ -565,6 +562,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 			valid_blocks++;
 	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
 }
+#else
+#define check_seg_range(sbi, segno)
+#define verify_block_addr(sbi, blk_addr)
+#define check_block_count(sbi, segno, raw_sit)
+#endif
 
 static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
 						unsigned int start)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index f685138dd496..89d506d0b35d 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -372,7 +372,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 			alloc_nid_failed(sbi, new_nid);
 			return PTR_ERR(xpage);
 		}
-		BUG_ON(new_nid);
+		f2fs_bug_on(new_nid);
 	} else {
 		struct dnode_of_data dn;
 		set_new_dnode(&dn, inode, NULL, NULL, new_nid);
-- 
cgit v1.3


From cc3de6a3acce264f4eb0b5bf552478e5f1380bba Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 29 Oct 2013 14:17:05 +0800
Subject: f2fs: fix calculating incorrect free size when update xattr in
 __f2fs_setxattr

During xattr updating, free size should be corrected to remainder free size
+ old entry size.
It can avoid ENOSPC error when we update old entry with the same size new
entry at fully filled xattr.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 89d506d0b35d..aa7a3f139fe5 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -520,7 +520,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index,
 		 */
 		free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
 		if (found)
-			free = free - ENTRY_SIZE(here);
+			free = free + ENTRY_SIZE(here);
 
 		if (free < newsize) {
 			error = -ENOSPC;
-- 
cgit v1.3


From 01a87d91fcecced2e2bdcd797794b4ede1f64b77 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@primarydata.com>
Date: Mon, 14 Oct 2013 09:01:55 +0300
Subject: nfsd: no need to unhash_stid before free

idr_remove is about to be called before kmem_cache_free so unhashing it
is redundant

Signed-off-by: Benny Halevy <bhalevy@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 77dfc5a6a011..a90949aefe80 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -672,7 +672,6 @@ static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
 static void release_open_stateid(struct nfs4_ol_stateid *stp)
 {
 	unhash_open_stateid(stp);
-	unhash_stid(&stp->st_stid);
 	free_generic_stateid(stp);
 }
 
@@ -694,7 +693,6 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo)
 	struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
 
 	if (s) {
-		unhash_stid(&s->st_stid);
 		free_generic_stateid(s);
 		oo->oo_last_closed_stid = NULL;
 	}
@@ -3998,10 +3996,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfsd4_close_open_stateid(stp);
 
-	if (cstate->minorversion) {
-		unhash_stid(&stp->st_stid);
+	if (cstate->minorversion)
 		free_generic_stateid(stp);
-	} else
+	else
 		oo->oo_last_closed_stid = stp;
 
 	if (list_empty(&oo->oo_owner.so_stateids)) {
-- 
cgit v1.3


From 956c4fee446c568ad102625da931e259f22c67ee Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@primarydata.com>
Date: Tue, 29 Oct 2013 11:39:12 +0200
Subject: nfsd4: need to destroy revoked delegations in destroy_client

[use list_splice_init]
Signed-off-by: Benny Halevy <bhalevy@primarydata.com>
[bfields: no need for recall_lock here]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a90949aefe80..21eb6788cb33 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1129,6 +1129,11 @@ destroy_client(struct nfs4_client *clp)
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
 		destroy_delegation(dp);
 	}
+	list_splice_init(&clp->cl_revoked, &reaplist);
+	while (!list_empty(&reaplist)) {
+		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
+		destroy_revoked_delegation(dp);
+	}
 	while (!list_empty(&clp->cl_openowners)) {
 		oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
 		release_openowner(oo);
-- 
cgit v1.3


From 49e7372063a220651b4f12fa46113559533cafee Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 12 Sep 2013 09:31:39 -0400
Subject: nfsd: fh_update should error out in unexpected cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reporter saw a NULL dereference when a filesystem's ->mknod returned
success but left the dentry negative, and then nfsd tried to dereference
d_inode (in this case because the CREATE was followed by a GETATTR in
the same nfsv4 compound).

fh_update already checks for this and another broken case, but for some
reason it returns success and leaves nfsd trying to soldier on.  If it
failed we'd avoid the crash.  There's only so much we can do with a
buggy filesystem, but it's easy enough to bail out here, so let's do
that.

Reported-by: Antti Tönkyrä <daedalus@pingtimeout.net>
Tested-by: Antti Tönkyrä <daedalus@pingtimeout.net>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsfh.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3d0e15ae6f72..3c37b160dcad 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -598,22 +598,20 @@ fh_update(struct svc_fh *fhp)
 		_fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
 	} else {
 		if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
-			goto out;
+			return 0;
 
 		_fh_update(fhp, fhp->fh_export, dentry);
 		if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
 			return nfserr_opnotsupp;
 	}
-out:
 	return 0;
-
 out_bad:
 	printk(KERN_ERR "fh_update: fh not verified!\n");
-	goto out;
+	return nfserr_serverfault;
 out_negative:
 	printk(KERN_ERR "fh_update: %pd2 still negative!\n",
 		dentry);
-	goto out;
+	return nfserr_serverfault;
 }
 
 /*
-- 
cgit v1.3


From 427d6c6646d868fbd3094e7e2e1644d480cd9204 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 13 Sep 2013 11:08:45 -0400
Subject: nfsd: return better errors to exportfs

Someone noticed exportfs happily accepted exports that would later be
rejected when mountd tried to give them to the kernel.  Fix this.

This is a regression from 4c1e1b34d5c800ad3ac9a7e2805b0bea70ad2278
"nfsd: Store ex_anon_uid and ex_anon_gid as kuids and kgids".

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: stable@vger.kernel.org
Reported-by: Yin.JianHong <jiyin@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5f38ea36e266..af51cf9bf2e3 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -536,16 +536,12 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		if (err)
 			goto out3;
 		exp.ex_anon_uid= make_kuid(&init_user_ns, an_int);
-		if (!uid_valid(exp.ex_anon_uid))
-			goto out3;
 
 		/* anon gid */
 		err = get_int(&mesg, &an_int);
 		if (err)
 			goto out3;
 		exp.ex_anon_gid= make_kgid(&init_user_ns, an_int);
-		if (!gid_valid(exp.ex_anon_gid))
-			goto out3;
 
 		/* fsid */
 		err = get_int(&mesg, &an_int);
@@ -583,6 +579,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				   exp.ex_uuid);
 		if (err)
 			goto out4;
+		/*
+		 * For some reason exportfs has been passing down an
+		 * invalid (-1) uid & gid on the "dummy" export which it
+		 * uses to test export support.  To make sure exportfs
+		 * sees errors from check_export we therefore need to
+		 * delay these checks till after check_export:
+		 */
+		if (!uid_valid(exp.ex_anon_uid))
+			goto out4;
+		if (!gid_valid(exp.ex_anon_gid))
+			goto out4;
 	}
 
 	expp = svc_export_lookup(&exp);
-- 
cgit v1.3


From 6f6cc3205c5f10129b8a10cdf8abf85d9db48a60 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 13 Sep 2013 17:50:42 -0400
Subject: nfsd: -EINVAL on invalid anonuid/gid instead of silent failure

If we're going to refuse to accept these it would be polite of us to at
least say so....

This introduces a slight complication since we need to grandfather in
exportfs's ill-advised use of -1 uid and gid on its test_export.

If it turns out there are other users passing down -1 we may need to
do something else.

Best might be to drop the checks entirely, but I'm not sure if other
parts of the kernel might assume that a task can't run as uid or gid -1.

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index af51cf9bf2e3..8513c598fabf 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -579,6 +579,13 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				   exp.ex_uuid);
 		if (err)
 			goto out4;
+		/*
+		 * No point caching this if it would immediately expire.
+		 * Also, this protects exportfs's dummy export from the
+		 * anon_uid/anon_gid checks:
+		 */
+		if (exp.h.expiry_time < seconds_since_boot())
+			goto out4;
 		/*
 		 * For some reason exportfs has been passing down an
 		 * invalid (-1) uid & gid on the "dummy" export which it
@@ -586,10 +593,12 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		 * sees errors from check_export we therefore need to
 		 * delay these checks till after check_export:
 		 */
+		err = -EINVAL;
 		if (!uid_valid(exp.ex_anon_uid))
 			goto out4;
 		if (!gid_valid(exp.ex_anon_gid))
 			goto out4;
+		err = 0;
 	}
 
 	expp = svc_export_lookup(&exp);
-- 
cgit v1.3


From 56b3f3b884652395e1025a8e4f1c4bd47bc112c4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Oct 2013 11:49:07 -0400
Subject: sysfs: merge sysfs_elem_bin_attr into sysfs_elem_attr

3124eb1679 ("sysfs: merge regular and bin file handling") folded bin
file handling into regular file handling.  Among other things, bin
file now shares the same open path including sysfs_open_dirent
association using sysfs_dirent->s_attr.open.  This is buggy because
->s_bin_attr lives in the same union and doesn't have the field.  This
bug doesn't trigger because sysfs_elem_bin_attr doesn't have an active
field at the conflicting position.  It does have a field "buffers" but
it isn't used anymore.

This patch collapses sysfs_elem_bin_attr into sysfs_elem_attr so that
the bin_attr is accessed through ->s_attr.bin_attr which lives with
->s_attr.attr in an anonymous union.  The code paths already assume
bin_attr contains attr as the first element, so this doesn't add any
more assumptions while making it explicit that the two types are
handled together.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c  |  8 ++++----
 fs/sysfs/inode.c |  2 +-
 fs/sysfs/sysfs.h | 11 ++++-------
 3 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c3795978b404..0d7368d43619 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -154,7 +154,7 @@ static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
 			      size_t bytes, loff_t *off)
 {
 	struct sysfs_open_file *of = sysfs_of(file);
-	struct bin_attribute *battr = of->sd->s_bin_attr.bin_attr;
+	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
 	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
 	loff_t size = file_inode(file)->i_size;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
@@ -236,7 +236,7 @@ static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
 	}
 
 	if (sysfs_is_bin(of->sd)) {
-		struct bin_attribute *battr = of->sd->s_bin_attr.bin_attr;
+		struct bin_attribute *battr = of->sd->s_attr.bin_attr;
 
 		rc = -EIO;
 		if (battr->write)
@@ -466,7 +466,7 @@ static const struct vm_operations_struct sysfs_bin_vm_ops = {
 static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct sysfs_open_file *of = sysfs_of(file);
-	struct bin_attribute *battr = of->sd->s_bin_attr.bin_attr;
+	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
 	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
 	int rc;
 
@@ -618,7 +618,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		return -ENODEV;
 
 	if (sysfs_is_bin(attr_sd)) {
-		struct bin_attribute *battr = attr_sd->s_bin_attr.bin_attr;
+		struct bin_attribute *battr = attr_sd->s_attr.bin_attr;
 
 		has_read = battr->read || battr->mmap;
 		has_write = battr->write || battr->mmap;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 2cb1b6b8ccbc..825c55607af8 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -258,7 +258,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 		inode->i_fop = &sysfs_file_operations;
 		break;
 	case SYSFS_KOBJ_BIN_ATTR:
-		bin_attr = sd->s_bin_attr.bin_attr;
+		bin_attr = sd->s_attr.bin_attr;
 		inode->i_size = bin_attr->size;
 		inode->i_fop = &sysfs_bin_operations;
 		break;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 94ac8fa34dd4..c095d952f3a5 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -29,15 +29,13 @@ struct sysfs_elem_symlink {
 };
 
 struct sysfs_elem_attr {
-	struct attribute	*attr;
+	union {
+		struct attribute	*attr;
+		struct bin_attribute	*bin_attr;
+	};
 	struct sysfs_open_dirent *open;
 };
 
-struct sysfs_elem_bin_attr {
-	struct bin_attribute	*bin_attr;
-	struct hlist_head	buffers;
-};
-
 struct sysfs_inode_attrs {
 	struct iattr	ia_iattr;
 	void		*ia_secdata;
@@ -74,7 +72,6 @@ struct sysfs_dirent {
 		struct sysfs_elem_dir		s_dir;
 		struct sysfs_elem_symlink	s_symlink;
 		struct sysfs_elem_attr		s_attr;
-		struct sysfs_elem_bin_attr	s_bin_attr;
 	};
 
 	unsigned short		s_flags;
-- 
cgit v1.3


From 672f76a81ad524359e761d7c863d24aec58e23c5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Oct 2013 11:49:08 -0400
Subject: sysfs: honor bin_attr.attr.ignore_lockdep

ignore_lockdep is currently honored only for regular files.  There's
no reason to ignore it for bin files.  Update sysfs_ignore_lockdep()
so that bin_attr.attr.ignore_lockdep works too.

While this doesn't have any in-kernel user, this unifies the behaviors
between regular and bin files and will help later changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/sysfs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index c095d952f3a5..bdce8458af3b 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -114,7 +114,9 @@ do {								\
 /* Test for attributes that want to ignore lockdep for read-locking */
 static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
 {
-	return sysfs_type(sd) == SYSFS_KOBJ_ATTR &&
+	int type = sysfs_type(sd);
+
+	return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
 		sd->s_attr.attr->ignore_lockdep;
 }
 
-- 
cgit v1.3


From baa97cb50724e72ece05a7cead6533a9658ddf79 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Oct 2013 11:49:09 -0400
Subject: sysfs: remove unused sysfs_get_dentry() prototype

sysfs_get_dentry() has been gone for years now.  Remove the left-over
prototype.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/sysfs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index bdce8458af3b..e0753e5eff41 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -165,7 +165,6 @@ extern const struct dentry_operations sysfs_dentry_ops;
 extern const struct file_operations sysfs_dir_operations;
 extern const struct inode_operations sysfs_dir_inode_operations;
 
-struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
 void sysfs_put_active(struct sysfs_dirent *sd);
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
-- 
cgit v1.3


From 7eed6ecb0785681892ab1fe47188fc981241cfd0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Oct 2013 11:49:10 -0400
Subject: sysfs: move sysfs_hash_and_remove() to fs/sysfs/dir.c

Most removal related logic is implemented in fs/sysfs/dir.c.  Move
sysfs_hash_and_remove() to fs/sysfs/dir.c so that __sysfs_remove()
doesn't have to be public.

This is pure relocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c   | 38 +++++++++++++++++++++++++++++++++++++-
 fs/sysfs/inode.c | 26 --------------------------
 fs/sysfs/sysfs.h |  5 ++---
 3 files changed, 39 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index eab59de47556..486238d06021 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -813,7 +813,8 @@ static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
 	return pos->s_parent;
 }
 
-void __sysfs_remove(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
+			   struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *pos, *next;
 
@@ -846,6 +847,41 @@ void sysfs_remove(struct sysfs_dirent *sd)
 	sysfs_addrm_finish(&acxt);
 }
 
+/**
+ * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
+ * @dir_sd: parent of the target
+ * @name: name of the sysfs_dirent to remove
+ * @ns: namespace tag of the sysfs_dirent to remove
+ *
+ * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
+ * it.  Returns 0 on success, -ENOENT if such entry doesn't exist.
+ */
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
+			  const void *ns)
+{
+	struct sysfs_addrm_cxt acxt;
+	struct sysfs_dirent *sd;
+
+	if (!dir_sd) {
+		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
+			name);
+		return -ENOENT;
+	}
+
+	sysfs_addrm_start(&acxt);
+
+	sd = sysfs_find_dirent(dir_sd, name, ns);
+	if (sd)
+		__sysfs_remove(&acxt, sd);
+
+	sysfs_addrm_finish(&acxt);
+
+	if (sd)
+		return 0;
+	else
+		return -ENOENT;
+}
+
 /**
  *	sysfs_remove_dir - remove an object's directory.
  *	@kobj:	object.
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 825c55607af8..1750f790af3b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -314,32 +314,6 @@ void sysfs_evict_inode(struct inode *inode)
 	sysfs_put(sd);
 }
 
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns)
-{
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-
-	if (!dir_sd) {
-		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
-			name);
-		return -ENOENT;
-	}
-
-	sysfs_addrm_start(&acxt);
-
-	sd = sysfs_find_dirent(dir_sd, name, ns);
-	if (sd)
-		__sysfs_remove(&acxt, sd);
-
-	sysfs_addrm_finish(&acxt);
-
-	if (sd)
-		return 0;
-	else
-		return -ENOENT;
-}
-
 int sysfs_permission(struct inode *inode, int mask)
 {
 	struct sysfs_dirent *sd;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index e0753e5eff41..8d3dc1ddb546 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -172,8 +172,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
 		    struct sysfs_dirent *parent_sd);
 int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
 		  struct sysfs_dirent *parent_sd);
-void __sysfs_remove(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_remove(struct sysfs_dirent *sd);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
+			  const void *ns);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
@@ -218,8 +219,6 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		   size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns);
 int sysfs_inode_init(void);
 
 /*
-- 
cgit v1.3


From d1c1459e45944e336a968acce1e459c9effcde47 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Oct 2013 11:49:11 -0400
Subject: sysfs: separate out dup filename warning into a separate function

Separate out sysfs_warn_dup() out of sysfs_add_one().  This will help
separating out the core sysfs functionalities into kernfs so that it
can be used by non-sysfs users too.

This doesn't make any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c   | 30 +++++++++++++++++++-----------
 fs/sysfs/sysfs.h |  1 +
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 486238d06021..de47ed32d5c7 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -470,6 +470,23 @@ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
 	return path;
 }
 
+void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
+{
+	char *path;
+
+	path = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (path) {
+		sysfs_pathname(parent, path);
+		strlcat(path, "/", PATH_MAX);
+		strlcat(path, name, PATH_MAX);
+	}
+
+	WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n",
+	     path ? path : name);
+
+	kfree(path);
+}
+
 /**
  *	sysfs_add_one - add sysfs_dirent to parent
  *	@acxt: addrm context to use
@@ -497,18 +514,9 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
 	int ret;
 
 	ret = __sysfs_add_one(acxt, sd, parent_sd);
-	if (ret == -EEXIST) {
-		char *path = kzalloc(PATH_MAX, GFP_KERNEL);
-		WARN(1, KERN_WARNING
-		     "sysfs: cannot create duplicate filename '%s'\n",
-		     (path == NULL) ? sd->s_name
-				    : (sysfs_pathname(parent_sd, path),
-				       strlcat(path, "/", PATH_MAX),
-				       strlcat(path, sd->s_name, PATH_MAX),
-				       path));
-		kfree(path);
-	}
 
+	if (ret == -EEXIST)
+		sysfs_warn_dup(parent_sd, sd->s_name);
 	return ret;
 }
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 8d3dc1ddb546..05d063fe69c7 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -168,6 +168,7 @@ extern const struct inode_operations sysfs_dir_inode_operations;
 struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
 void sysfs_put_active(struct sysfs_dirent *sd);
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
+void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
 int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
 		    struct sysfs_dirent *parent_sd);
 int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-- 
cgit v1.3


From 9a47938b226cc2b8e2afd72b0f1ca1a7e1367cf5 Mon Sep 17 00:00:00 2001
From: Fan Li <fanofcode.li@samsung.com>
Date: Tue, 29 Oct 2013 16:21:47 +0800
Subject: f2fs: change the method of calculating the number summary blocks

npages_for_summary_flush uses (SUMMARY_SIZE + 1) as the size of a f2fs_summary
while its actual size is  SUMMARY_SIZE. So the result sometimes is bigger than
actual number by one, which causes checkpoint can't be written into disk
contiguously, and sometimes summary blocks can't be compacted like they should.
Besides, when writing summary blocks into pages, if remain space in a page
isn't big enough for one f2fs_summary, it will be left unused, current code
seems not to take it into account.

Signed-off-by: Fan Li <fanofcode.li@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index c7161defff90..3d4d5fc19e6f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -264,9 +264,8 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
  */
 int npages_for_summary_flush(struct f2fs_sb_info *sbi)
 {
-	int total_size_bytes = 0;
 	int valid_sum_count = 0;
-	int i, sum_space;
+	int i, sum_in_page;
 
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
 		if (sbi->ckpt->alloc_type[i] == SSR)
@@ -275,13 +274,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
 			valid_sum_count += curseg_blkoff(sbi, i);
 	}
 
-	total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
-			+ sizeof(struct nat_journal) + 2
-			+ sizeof(struct sit_journal) + 2;
-	sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
-	if (total_size_bytes < sum_space)
+	sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
+			SUM_FOOTER_SIZE) / SUMMARY_SIZE;
+	if (valid_sum_count <= sum_in_page)
 		return 1;
-	else if (total_size_bytes < 2 * sum_space)
+	else if ((valid_sum_count - sum_in_page) <=
+		(PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
 		return 2;
 	return 3;
 }
-- 
cgit v1.3


From 44c60bf2b9f2808e127fa4cd3b11b50a46cf5817 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Tue, 29 Oct 2013 14:50:40 +0800
Subject: f2fs: check all ones or zeros bitmap with bitops for better mount
 performance

Previously, check_block_count check valid_map with bit data type in common
scenario that sit has all ones or zeros bitmap, it makes low mount performance.
So let's check the special bitmap with integer data type instead of the bit one.

v1-->v2:
 o use find_next_{zero_}bit_le for better performance and readable as Jaegeuk
   suggested.
 o use neat logogram in comment as Gu Zheng suggested.
 o search continuous ones or zeros for better performance when checking mixed
   bitmap.

Suggested-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Signed-off-by: Shu Tan <shu.tan@samsung.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 001d4c4d97fd..269f690b4e24 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -547,8 +547,9 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 {
 	struct f2fs_sm_info *sm_info = SM_I(sbi);
 	unsigned int end_segno = sm_info->segment_count - 1;
+	bool is_valid  = test_bit_le(0, raw_sit->valid_map) ? true : false;
 	int valid_blocks = 0;
-	int i;
+	int cur_pos = 0, next_pos;
 
 	/* check segment usage */
 	BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
@@ -557,9 +558,19 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 	BUG_ON(segno > end_segno);
 
 	/* check bitmap with valid block count */
-	for (i = 0; i < sbi->blocks_per_seg; i++)
-		if (f2fs_test_bit(i, raw_sit->valid_map))
-			valid_blocks++;
+	do {
+		if (is_valid) {
+			next_pos = find_next_zero_bit_le(&raw_sit->valid_map,
+					sbi->blocks_per_seg,
+					cur_pos);
+			valid_blocks += next_pos - cur_pos;
+		} else
+			next_pos = find_next_bit_le(&raw_sit->valid_map,
+					sbi->blocks_per_seg,
+					cur_pos);
+		cur_pos = next_pos;
+		is_valid = !is_valid;
+	} while (cur_pos < sbi->blocks_per_seg);
 	BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
 }
 #else
-- 
cgit v1.3


From bbf023c74dcf380769aec00aedd706a266b7d9ef Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 30 Oct 2013 07:27:16 -0400
Subject: ext4: pair trace_ext4_writepages & trace_ext4_writepages_result

Pair the two trace events to make troubeshooting writepages
easier, and it should be more convinient to write a simple script
to parse the traces.

Cc: linux-ext4@vger.kernel.org
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 94aac67b55c9..43015fa69c3a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2423,16 +2423,15 @@ static int ext4_writepages(struct address_space *mapping,
 	 * because that could violate lock ordering on umount
 	 */
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
+		goto out_writepages;
 
 	if (ext4_should_journal_data(inode)) {
 		struct blk_plug plug;
-		int ret;
 
 		blk_start_plug(&plug);
 		ret = write_cache_pages(mapping, wbc, __writepage, mapping);
 		blk_finish_plug(&plug);
-		return ret;
+		goto out_writepages;
 	}
 
 	/*
@@ -2445,8 +2444,10 @@ static int ext4_writepages(struct address_space *mapping,
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
-	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
-		return -EROFS;
+	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+		ret = -EROFS;
+		goto out_writepages;
+	}
 
 	if (ext4_should_dioread_nolock(inode)) {
 		/*
-- 
cgit v1.3


From 48ffdab1c1eb87fee9ed7366472e2b9aa4577a60 Mon Sep 17 00:00:00 2001
From: BoxiLiu <lewis.liulei@huawei.com>
Date: Wed, 30 Oct 2013 08:07:20 -0400
Subject: ext4: change ext4_read_inline_dir() to return 0 on success

In ext4_read_inline_dir(), if there is inline data, the successful
return value is the return value of ext4_read_inline_data().  Howewer,
this is used by ext4_readdir(), and while it seems harmless to return
a positive value on success, it's inconsistent, since historically
we've always return 0 on success.

Signed-off-by: BoxiLiu <lewis.liulei@huawei.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ext4/inline.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d9ecbf1113a7..c3efb65d2ec6 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1442,6 +1442,7 @@ int ext4_read_inline_dir(struct file *file,
 	if (ret < 0)
 		goto out;
 
+	ret = 0;
 	sb = inode->i_sb;
 	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
 	offset = ctx->pos;
-- 
cgit v1.3


From e1a90ebd8b2349eb00ec22f0b8bf6ab8bbd06cc8 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <bjschuma@netapp.com>
Date: Wed, 30 Oct 2013 10:00:20 -0400
Subject: NFSD: Combine decode operations for v4 and v4.1

We were using a different array of function pointers to represent each
minor version.  This makes adding a new minor version tedious, since it
needs a step to copy, paste and modify a new version of the same
functions.

This patch combines the v4 and v4.1 arrays into a single instance and
will check minor version support inside each decoder function.

Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 98 +++++++++++++++++++++++--------------------------------
 1 file changed, 40 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d9454fe5653f..99bebea20668 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -945,13 +945,16 @@ static __be32
 nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
 {
 	DECODE_HEAD;
-		    
+
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
 	status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
 	if (status)
 		return status;
 	READ_BUF(4);
 	READ32(open_conf->oc_seqid);
-						        
+
 	DECODE_TAIL;
 }
 
@@ -990,6 +993,14 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
 	DECODE_TAIL;
 }
 
+static __be32
+nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
+{
+	if (argp->minorversion == 0)
+		return nfs_ok;
+	return nfserr_notsupp;
+}
+
 static __be32
 nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
 {
@@ -1061,6 +1072,9 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
 {
 	DECODE_HEAD;
 
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
 	READ_BUF(sizeof(clientid_t));
 	COPYMEM(clientid, sizeof(clientid_t));
 
@@ -1111,6 +1125,9 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
 {
 	DECODE_HEAD;
 
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
 	READ_BUF(NFS4_VERIFIER_SIZE);
 	COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE);
 
@@ -1137,6 +1154,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
 {
 	DECODE_HEAD;
 
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
 	READ_BUF(8 + NFS4_VERIFIER_SIZE);
 	COPYMEM(&scd_c->sc_clientid, 8);
 	COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE);
@@ -1220,6 +1240,9 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
 {
 	DECODE_HEAD;
 
+	if (argp->minorversion >= 1)
+		return nfserr_notsupp;
+
 	READ_BUF(12);
 	COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t));
 	READ32(rlockowner->rl_owner.len);
@@ -1519,7 +1542,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_OPEN_CONFIRM]	= (nfsd4_dec)nfsd4_decode_open_confirm,
 	[OP_OPEN_DOWNGRADE]	= (nfsd4_dec)nfsd4_decode_open_downgrade,
 	[OP_PUTFH]		= (nfsd4_dec)nfsd4_decode_putfh,
-	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_putpubfh,
 	[OP_PUTROOTFH]		= (nfsd4_dec)nfsd4_decode_noop,
 	[OP_READ]		= (nfsd4_dec)nfsd4_decode_read,
 	[OP_READDIR]		= (nfsd4_dec)nfsd4_decode_readdir,
@@ -1536,46 +1559,6 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_VERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
 	[OP_WRITE]		= (nfsd4_dec)nfsd4_decode_write,
 	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_release_lockowner,
-};
-
-static nfsd4_dec nfsd41_dec_ops[] = {
-	[OP_ACCESS]		= (nfsd4_dec)nfsd4_decode_access,
-	[OP_CLOSE]		= (nfsd4_dec)nfsd4_decode_close,
-	[OP_COMMIT]		= (nfsd4_dec)nfsd4_decode_commit,
-	[OP_CREATE]		= (nfsd4_dec)nfsd4_decode_create,
-	[OP_DELEGPURGE]		= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_DELEGRETURN]	= (nfsd4_dec)nfsd4_decode_delegreturn,
-	[OP_GETATTR]		= (nfsd4_dec)nfsd4_decode_getattr,
-	[OP_GETFH]		= (nfsd4_dec)nfsd4_decode_noop,
-	[OP_LINK]		= (nfsd4_dec)nfsd4_decode_link,
-	[OP_LOCK]		= (nfsd4_dec)nfsd4_decode_lock,
-	[OP_LOCKT]		= (nfsd4_dec)nfsd4_decode_lockt,
-	[OP_LOCKU]		= (nfsd4_dec)nfsd4_decode_locku,
-	[OP_LOOKUP]		= (nfsd4_dec)nfsd4_decode_lookup,
-	[OP_LOOKUPP]		= (nfsd4_dec)nfsd4_decode_noop,
-	[OP_NVERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
-	[OP_OPEN]		= (nfsd4_dec)nfsd4_decode_open,
-	[OP_OPENATTR]		= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_OPEN_CONFIRM]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_OPEN_DOWNGRADE]	= (nfsd4_dec)nfsd4_decode_open_downgrade,
-	[OP_PUTFH]		= (nfsd4_dec)nfsd4_decode_putfh,
-	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_PUTROOTFH]		= (nfsd4_dec)nfsd4_decode_noop,
-	[OP_READ]		= (nfsd4_dec)nfsd4_decode_read,
-	[OP_READDIR]		= (nfsd4_dec)nfsd4_decode_readdir,
-	[OP_READLINK]		= (nfsd4_dec)nfsd4_decode_noop,
-	[OP_REMOVE]		= (nfsd4_dec)nfsd4_decode_remove,
-	[OP_RENAME]		= (nfsd4_dec)nfsd4_decode_rename,
-	[OP_RENEW]		= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_RESTOREFH]		= (nfsd4_dec)nfsd4_decode_noop,
-	[OP_SAVEFH]		= (nfsd4_dec)nfsd4_decode_noop,
-	[OP_SECINFO]		= (nfsd4_dec)nfsd4_decode_secinfo,
-	[OP_SETATTR]		= (nfsd4_dec)nfsd4_decode_setattr,
-	[OP_SETCLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_VERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
-	[OP_WRITE]		= (nfsd4_dec)nfsd4_decode_write,
-	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_notsupp,
 
 	/* new operations for NFSv4.1 */
 	[OP_BACKCHANNEL_CTL]	= (nfsd4_dec)nfsd4_decode_backchannel_ctl,
@@ -1599,23 +1582,23 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
 };
 
-struct nfsd4_minorversion_ops {
-	nfsd4_dec *decoders;
-	int nops;
-};
-
-static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
-	[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
-	[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
-	[2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
-};
+static inline bool
+nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
+{
+	if (op->opnum < FIRST_NFS4_OP || op->opnum > LAST_NFS4_OP)
+		return false;
+	else if (argp->minorversion == 0 && op->opnum > OP_RELEASE_LOCKOWNER)
+		return false;
+	else if (argp->minorversion == 1 && op->opnum > OP_RECLAIM_COMPLETE)
+		return false;
+	return true;
+}
 
 static __be32
 nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 {
 	DECODE_HEAD;
 	struct nfsd4_op *op;
-	struct nfsd4_minorversion_ops *ops;
 	bool cachethis = false;
 	int i;
 
@@ -1640,10 +1623,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		}
 	}
 
-	if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion))
+	if (argp->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
 		argp->opcnt = 0;
 
-	ops = &nfsd4_minorversion[argp->minorversion];
 	for (i = 0; i < argp->opcnt; i++) {
 		op = &argp->ops[i];
 		op->replay = NULL;
@@ -1651,8 +1633,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		READ_BUF(4);
 		READ32(op->opnum);
 
-		if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
-			op->status = ops->decoders[op->opnum](argp, &op->u);
+		if (nfsd4_opnum_in_range(argp, op))
+			op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
 		else {
 			op->opnum = OP_ILLEGAL;
 			op->status = nfserr_op_illegal;
-- 
cgit v1.3


From e50a26dc785f12a6df1f1ac16d62faf0342b72af Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 30 Oct 2013 10:33:09 -0400
Subject: nfsd4: nfsd_shutdown_net needs state lock

A comment claims the caller should take it, but that's not being done.
Note we don't want it around the cancel_delayed_work_sync since that may
wait on work which holds the client lock.

Reported-by: Benny Halevy <bhalevy@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 21eb6788cb33..e03e8efff53f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5124,7 +5124,6 @@ out_recovery:
 	return ret;
 }
 
-/* should be called with the state lock held */
 void
 nfs4_state_shutdown_net(struct net *net)
 {
@@ -5135,6 +5134,7 @@ nfs4_state_shutdown_net(struct net *net)
 	cancel_delayed_work_sync(&nn->laundromat_work);
 	locks_end_grace(&nn->nfsd4_manager);
 
+	nfs4_lock_state();
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
@@ -5149,6 +5149,7 @@ nfs4_state_shutdown_net(struct net *net)
 
 	nfsd4_client_tracking_exit(net);
 	nfs4_state_destroy_net(net);
+	nfs4_unlock_state();
 }
 
 void
-- 
cgit v1.3


From 5ba052fe33807893a26076d2ac95cf31d5e0b578 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Wed, 30 Oct 2013 10:53:10 -0400
Subject: ext4: drop set but otherwise unused variable from
 ext4_add_dirent_to_inline()

Signed-off-by: Azat Khuzhin <a3at.mail@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inline.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c3efb65d2ec6..bae987549dc3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -994,11 +994,9 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
 	struct inode	*dir = dentry->d_parent->d_inode;
 	const char	*name = dentry->d_name.name;
 	int		namelen = dentry->d_name.len;
-	unsigned short	reclen;
 	int		err;
 	struct ext4_dir_entry_2 *de;
 
-	reclen = EXT4_DIR_REC_LEN(namelen);
 	err = ext4_find_dest_de(dir, inode, iloc->bh,
 				inline_start, inline_size,
 				name, namelen, &de);
-- 
cgit v1.3


From 8f9ff189205a6817aee5a1f996f876541f86e07c Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 30 Oct 2013 11:10:52 -0400
Subject: ext4: fix FITRIM in no journal mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using FITRIM ioctl on a file system without journal it will
only trim the block group once, no matter how many times you invoke
FITRIM ioctl and how many block you release from the block group.

It is because we only clear EXT4_GROUP_INFO_WAS_TRIMMED_BIT in journal
callback. Fix this by clearing the bit in no journal mode as well.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reported-by: Jorge Fábregas <jorge.fabregas@gmail.com>
---
 fs/ext4/mballoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a41e3ba8cfaa..4d113efa024c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4794,8 +4794,8 @@ do_more:
 					 " group:%d block:%d count:%lu failed"
 					 " with %d", block_group, bit, count,
 					 err);
-		}
-
+		} else
+			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
 
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
-- 
cgit v1.3


From 1c1365e374bfadf908eae02cded7abb9e672c9d4 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Wed, 30 Oct 2013 14:08:32 +0200
Subject: sysfs: return correct error code on unimplemented mmap()

Both POSIX.1-2008 and Linux Programmer's Manual have a dedicated return
error code for a case, when a file doesn't support mmap(), it's ENODEV.

This change replaces overloaded EINVAL with ENODEV in a situation
described above for sysfs binary files.

Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 0d7368d43619..382db3c045f3 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -477,7 +477,6 @@ static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!sysfs_get_active(of->sd))
 		goto out_unlock;
 
-	rc = -EINVAL;
 	if (!battr->mmap)
 		goto out_put;
 
-- 
cgit v1.3


From 32c5483a8a13a43264809144210ec114dd70b611 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:46 +1100
Subject: xfs: abstract the differences in dir2/dir3 via an ops vector

Lots of the dir code now goes through switches to determine what is
the correct on-disk format to parse. It generally involves a
"xfs_sbversion_hasfoo" check, deferencing the superblock version and
feature fields and hence touching several cache lines per operation
in the process. Some operations do multiple checks because they nest
conditional operations and they don't pass the information in a
direct fashion between each other.

Hence, add an ops vector to the xfs_inode structure that is
configured when the inode is initialised to point to all the correct
decode and encoding operations.  This will significantly reduce the
branchiness and cacheline footprint of the directory object decoding
and encoding.

This is the first patch in a series of conversion patches. It will
introduce the ops structure, the setup of it and add the first
operation to the vector. Subsequent patches will convert directory
ops one at a time to keep the changes simple and obvious.

Just this patch shows the benefit of such an approach on code size.
Just converting the two shortform dir operations as this patch does
decreases the built binary size by ~1500 bytes:

$ size fs/xfs/xfs.o.orig fs/xfs/xfs.o.p1
   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
$

That's a significant decrease in the instruction cache footprint of
the directory code for such a simple change, and indicates that this
approach is definitely worth pursuing further.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/Makefile           |  1 +
 fs/xfs/xfs_da_btree.h     |  1 +
 fs/xfs/xfs_da_format.c    | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_da_format.h    | 26 ---------------
 fs/xfs/xfs_dir2.c         |  7 ++++
 fs/xfs/xfs_dir2.h         | 14 ++++++++
 fs/xfs/xfs_dir2_block.c   |  2 +-
 fs/xfs/xfs_dir2_readdir.c |  4 +--
 fs/xfs/xfs_dir2_sf.c      | 31 +++++++++--------
 fs/xfs/xfs_inode.h        |  3 ++
 fs/xfs/xfs_iops.c         |  1 +
 fs/xfs/xfs_mount.h        |  2 ++
 12 files changed, 132 insertions(+), 45 deletions(-)
 create mode 100644 fs/xfs/xfs_da_format.c

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 33a69fabfd83..c21f43506661 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -66,6 +66,7 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_bmap_btree.o \
 				   xfs_btree.o \
 				   xfs_da_btree.o \
+				   xfs_da_format.o \
 				   xfs_dir2.o \
 				   xfs_dir2_block.o \
 				   xfs_dir2_data.o \
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index e492dcadd032..6e95ea79f5d7 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -23,6 +23,7 @@ struct xfs_bmap_free;
 struct xfs_inode;
 struct xfs_trans;
 struct zone;
+struct xfs_dir_ops;
 
 /*========================================================================
  * Btree searching and modification structure definitions.
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
new file mode 100644
index 000000000000..982d105d012f
--- /dev/null
+++ b/fs/xfs/xfs_da_format.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+
+
+static int
+xfs_dir2_sf_entsize(
+	struct xfs_dir2_sf_hdr	*hdr,
+	int			len)
+{
+	int count = sizeof(struct xfs_dir2_sf_entry);	/* namelen + offset */
+
+	count += len;					/* name */
+	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+				sizeof(xfs_dir2_ino4_t); /* ino # */
+	return count;
+}
+
+static int
+xfs_dir3_sf_entsize(
+	struct xfs_dir2_sf_hdr	*hdr,
+	int			len)
+{
+	return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return (struct xfs_dir2_sf_entry *)
+		((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
+}
+
+
+const struct xfs_dir_ops xfs_dir2_ops = {
+	.sf_entsize = xfs_dir2_sf_entsize,
+	.sf_nextentry = xfs_dir2_sf_nextentry,
+};
+
+const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+	.sf_entsize = xfs_dir3_sf_entsize,
+	.sf_nextentry = xfs_dir3_sf_nextentry,
+};
+
+const struct xfs_dir_ops xfs_dir3_ops = {
+	.sf_entsize = xfs_dir3_sf_entsize,
+	.sf_nextentry = xfs_dir3_sf_nextentry,
+};
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index 89a1a219c8ff..d54726d0fc10 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -329,32 +329,6 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
 		((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
 }
 
-static inline int
-xfs_dir3_sf_entsize(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	int			len)
-{
-	int count = sizeof(struct xfs_dir2_sf_entry); 	/* namelen + offset */
-
-	count += len;					/* name */
-	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
-				sizeof(xfs_dir2_ino4_t); /* ino # */
-	if (xfs_sb_version_hasftype(&mp->m_sb))
-		count += sizeof(__uint8_t);		/* file type */
-	return count;
-}
-
-static inline struct xfs_dir2_sf_entry *
-xfs_dir3_sf_nextentry(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return (struct xfs_dir2_sf_entry *)
-		((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
-}
-
 /*
  * in dir3 shortform directories, the file type field is stored at a variable
  * offset after the inode number. Because it's only a single byte, endian
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 38bf9324302c..7911136453cd 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -112,6 +112,13 @@ xfs_dir_mount(
 		mp->m_dirnameops = &xfs_ascii_ci_nameops;
 	else
 		mp->m_dirnameops = &xfs_default_nameops;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		mp->m_dir_inode_ops = &xfs_dir3_ops;
+	else if (xfs_sb_version_hasftype(&mp->m_sb))
+		mp->m_dir_inode_ops = &xfs_dir2_ftype_ops;
+	else
+		mp->m_dir_inode_ops = &xfs_dir2_ops;
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 9910401327d4..1909d9faff71 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -31,6 +31,20 @@ struct xfs_dir2_data_unused;
 
 extern struct xfs_name	xfs_name_dotdot;
 
+/*
+ * directory operations vector for encode/decode routines
+ */
+struct xfs_dir_ops {
+	int	(*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+	struct xfs_dir2_sf_entry *
+		(*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+				struct xfs_dir2_sf_entry *sfep);
+};
+
+extern const struct xfs_dir_ops xfs_dir2_ops;
+extern const struct xfs_dir_ops xfs_dir2_ftype_ops;
+extern const struct xfs_dir_ops xfs_dir3_ops;
+
 /*
  * Generic directory interface routines
  */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 9f3f83a5e2da..9d86b6f9e80f 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1240,7 +1240,7 @@ xfs_dir2_sf_to_block(
 		if (++i == sfp->count)
 			sfep = NULL;
 		else
-			sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 	}
 	/* Done with the temporary buffer */
 	kmem_free(sfp);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 45c9ce8cdb28..80333055df34 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -153,7 +153,7 @@ xfs_dir2_sf_getdents(
 				xfs_dir2_sf_get_offset(sfep));
 
 		if (ctx->pos > off) {
-			sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 			continue;
 		}
 
@@ -163,7 +163,7 @@ xfs_dir2_sf_getdents(
 		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
 			    xfs_dir3_get_dtype(mp, filetype)))
 			return 0;
-		sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 	}
 
 	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 8811ee5eaec6..73881c9f40d6 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -336,7 +336,7 @@ xfs_dir2_block_to_sf(
 			xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
 					xfs_dir3_dirent_get_ftype(mp, dep));
 
-			sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 		}
 		ptr += xfs_dir3_data_entsize(mp, dep->namelen);
 	}
@@ -389,7 +389,7 @@ xfs_dir2_sf_addname(
 	/*
 	 * Compute entry (and change in) size.
 	 */
-	add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
+	add_entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
 	incr_isize = add_entsize;
 	objchange = 0;
 #if XFS_BIG_INUMS
@@ -483,8 +483,7 @@ xfs_dir2_sf_addname_easy(
 	/*
 	 * Grow the in-inode space.
 	 */
-	xfs_idata_realloc(dp,
-			  xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen),
+	xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
 			  XFS_DATA_FORK);
 	/*
 	 * Need to set up again due to realloc of the inode data.
@@ -563,7 +562,7 @@ xfs_dir2_sf_addname_hard(
 	      eof = (char *)oldsfep == &buf[old_isize];
 	     !eof;
 	     offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen),
-	      oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep),
+	      oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
 	      eof = (char *)oldsfep == &buf[old_isize]) {
 		new_offset = xfs_dir2_sf_get_offset(oldsfep);
 		if (offset + add_datasize <= new_offset)
@@ -603,7 +602,7 @@ xfs_dir2_sf_addname_hard(
 	 * If there's more left to copy, do that.
 	 */
 	if (!eof) {
-		sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 		memcpy(sfep, oldsfep, old_isize - nbytes);
 	}
 	kmem_free(buf);
@@ -653,7 +652,7 @@ xfs_dir2_sf_addname_pick(
 			holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
 		offset = xfs_dir2_sf_get_offset(sfep) +
 			 xfs_dir3_data_entsize(mp, sfep->namelen);
-		sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 	}
 	/*
 	 * Calculate data bytes used excluding the new entry, if this
@@ -719,7 +718,7 @@ xfs_dir2_sf_check(
 
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
 	     i < sfp->count;
-	     i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) {
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
 		ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
 		ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
 		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
@@ -848,7 +847,7 @@ xfs_dir2_sf_lookup(
 	 */
 	ci_sfep = NULL;
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-	     i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
 		/*
 		 * Compare name and if it's an exact match, return the inode
 		 * number. If it's the first case-insensitive match, store the
@@ -917,7 +916,7 @@ xfs_dir2_sf_removename(
 	 * Find the one we're deleting.
 	 */
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-	     i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
 		if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
 								XFS_CMP_EXACT) {
 			ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) ==
@@ -934,7 +933,7 @@ xfs_dir2_sf_removename(
 	 * Calculate sizes.
 	 */
 	byteoff = (int)((char *)sfep - (char *)sfp);
-	entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
+	entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
 	newsize = oldsize - entsize;
 	/*
 	 * Copy the part if any after the removed entry, sliding it down.
@@ -1051,7 +1050,7 @@ xfs_dir2_sf_replace(
 	 */
 	else {
 		for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-		     i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
+		     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
 			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
 								XFS_CMP_EXACT) {
 #if XFS_BIG_INUMS || defined(DEBUG)
@@ -1172,8 +1171,8 @@ xfs_dir2_sf_toino4(
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
 		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
 	     i < sfp->count;
-	     i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
-		  oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
 		sfep->namelen = oldsfep->namelen;
 		sfep->offset = oldsfep->offset;
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
@@ -1251,8 +1250,8 @@ xfs_dir2_sf_toino8(
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
 		    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
 	     i < sfp->count;
-	     i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
-		  oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
+	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
 		sfep->namelen = oldsfep->namelen;
 		sfep->offset = oldsfep->offset;
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 66675877f38c..9e6efccbae04 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -49,6 +49,9 @@ typedef struct xfs_inode {
 	xfs_ifork_t		*i_afp;		/* attribute fork pointer */
 	xfs_ifork_t		i_df;		/* data fork */
 
+	/* operations vectors */
+	const struct xfs_dir_ops *d_ops;		/* directory ops vector */
+
 	/* Transaction and locking information. */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 718b62b0fe05..0493587ea6bc 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1215,6 +1215,7 @@ xfs_setup_inode(
 		else
 			inode->i_op = &xfs_dir_inode_operations;
 		inode->i_fop = &xfs_dir_file_operations;
+		ip->d_ops = ip->i_mount->m_dir_inode_ops;
 		break;
 	case S_IFLNK:
 		inode->i_op = &xfs_symlink_inode_operations;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1fa0584b5627..973397f66c6b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -26,6 +26,7 @@ struct xfs_mru_cache;
 struct xfs_nameops;
 struct xfs_ail;
 struct xfs_quotainfo;
+struct xfs_dir_ops;
 
 #ifdef HAVE_PERCPU_SB
 
@@ -148,6 +149,7 @@ typedef struct xfs_mount {
 	int			m_dir_magicpct;	/* 37% of the dir blocksize */
 	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
 	const struct xfs_nameops *m_dirnameops;	/* vector of dir name ops */
+	const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
 	int			m_dirblksize;	/* directory block sz--bytes */
 	int			m_dirblkfsbs;	/* directory block sz--fsbs */
 	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
-- 
cgit v1.3


From 4740175e75f70ab71f76ae98ab00f7db731a48f7 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:47 +1100
Subject: xfs: vectorise remaining shortform dir2 ops

Following from the initial patch to introduce the directory
operations vector, convert the rest of the shortform directory
operations to use vectored ops rather than superblock feature
checks. This further reduces the size of the built binary:

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_format.c    | 152 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_da_format.h    |  47 --------------
 fs/xfs/xfs_dir2.h         |  20 +++---
 fs/xfs/xfs_dir2_block.c   |   6 +-
 fs/xfs/xfs_dir2_readdir.c |   6 +-
 fs/xfs/xfs_dir2_sf.c      | 151 +++++++++------------------------------------
 6 files changed, 199 insertions(+), 183 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index 982d105d012f..62f55a0186ae 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -69,17 +69,169 @@ xfs_dir3_sf_nextentry(
 }
 
 
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name.  Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enable directories, the type is always
+ * unknown and we never store the value.
+ */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sfe_put_ftype(
+	struct xfs_dir2_sf_entry *sfep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+	struct xfs_dir2_sf_entry *sfep)
+{
+	__uint8_t	ftype;
+
+	ftype = sfep->name[sfep->namelen];
+	if (ftype >= XFS_DIR3_FT_MAX)
+		return XFS_DIR3_FT_UNKNOWN;
+	return ftype;
+}
+
+static void
+xfs_dir3_sfe_put_ftype(
+	struct xfs_dir2_sf_entry *sfep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+	sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide.  These helpers deal with the
+ * two forms transparently by looking at the headers i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_dir2_inou_t		*from)
+{
+	if (hdr->i8count)
+		return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+	else
+		return get_unaligned_be32(&from->i4.i);
+}
+
+static void
+xfs_dir2_sf_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_dir2_inou_t		*to,
+	xfs_ino_t		ino)
+{
+	ASSERT((ino & 0xff00000000000000ULL) == 0);
+
+	if (hdr->i8count)
+		put_unaligned_be64(ino, &to->i8.i);
+	else
+		put_unaligned_be32(ino, &to->i4.i);
+}
+
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+	struct xfs_dir2_sf_hdr	*hdr)
+{
+	return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+}
+
+static void
+xfs_dir2_sf_put_parent_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return xfs_dir2_sf_get_ino(hdr,
+				(xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+}
+
+static void
+xfs_dir2_sfe_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr,
+			    (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+}
+
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep)
+{
+	return xfs_dir2_sf_get_ino(hdr,
+			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+}
+
+static void
+xfs_dir3_sfe_put_ino(
+	struct xfs_dir2_sf_hdr	*hdr,
+	struct xfs_dir2_sf_entry *sfep,
+	xfs_ino_t		ino)
+{
+	xfs_dir2_sf_put_ino(hdr,
+			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+}
+
 const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_entsize = xfs_dir2_sf_entsize,
 	.sf_nextentry = xfs_dir2_sf_nextentry,
+	.sf_get_ftype = xfs_dir2_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir2_sfe_put_ftype,
+	.sf_get_ino = xfs_dir2_sfe_get_ino,
+	.sf_put_ino = xfs_dir2_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
 };
 
 const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.sf_entsize = xfs_dir3_sf_entsize,
 	.sf_nextentry = xfs_dir3_sf_nextentry,
+	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
+	.sf_get_ino = xfs_dir3_sfe_get_ino,
+	.sf_put_ino = xfs_dir3_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
 };
 
 const struct xfs_dir_ops xfs_dir3_ops = {
 	.sf_entsize = xfs_dir3_sf_entsize,
 	.sf_nextentry = xfs_dir3_sf_nextentry,
+	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
+	.sf_put_ftype = xfs_dir3_sfe_put_ftype,
+	.sf_get_ino = xfs_dir3_sfe_get_ino,
+	.sf_put_ino = xfs_dir3_sfe_put_ino,
+	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
 };
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index d54726d0fc10..68c2ad5ba54f 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -329,53 +329,6 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
 		((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
 }
 
-/*
- * in dir3 shortform directories, the file type field is stored at a variable
- * offset after the inode number. Because it's only a single byte, endian
- * conversion is not necessary.
- */
-static inline __uint8_t *
-xfs_dir3_sfe_ftypep(
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return (__uint8_t *)&sfep->name[sfep->namelen];
-}
-
-static inline __uint8_t
-xfs_dir3_sfe_get_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	__uint8_t	*ftp;
-
-	if (!xfs_sb_version_hasftype(&mp->m_sb))
-		return XFS_DIR3_FT_UNKNOWN;
-
-	ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
-	if (*ftp >= XFS_DIR3_FT_MAX)
-		return XFS_DIR3_FT_UNKNOWN;
-	return *ftp;
-}
-
-static inline void
-xfs_dir3_sfe_put_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep,
-	__uint8_t		ftype)
-{
-	__uint8_t	*ftp;
-
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
-
-	if (!xfs_sb_version_hasftype(&mp->m_sb))
-		return;
-	ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
-	*ftp = ftype;
-}
-
 /*
  * Data block structures.
  *
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1909d9faff71..0e94b3e662af 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -39,6 +39,17 @@ struct xfs_dir_ops {
 	struct xfs_dir2_sf_entry *
 		(*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
 				struct xfs_dir2_sf_entry *sfep);
+	__uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+	void	(*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+				__uint8_t ftype);
+	xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+				struct xfs_dir2_sf_entry *sfep);
+	void	(*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+			      struct xfs_dir2_sf_entry *sfep,
+			      xfs_ino_t ino);
+	xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+	void	(*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+				     xfs_ino_t ino);
 };
 
 extern const struct xfs_dir_ops xfs_dir2_ops;
@@ -79,15 +90,6 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
 /*
  * Interface routines used by userspace utilities
  */
-extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
-extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp,
-		xfs_ino_t ino);
-extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp,
-		struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep);
-extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp,
-		struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep,
-		xfs_ino_t ino);
-
 extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
 extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
 extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 9d86b6f9e80f..960f3ab526f6 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1173,7 +1173,7 @@ xfs_dir2_sf_to_block(
 	 * Create entry for ..
 	 */
 	dep = xfs_dir3_data_dotdot_entry_p(mp, hdr);
-	dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
+	dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
 	dep->namelen = 2;
 	dep->name[0] = dep->name[1] = '.';
 	xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
@@ -1222,10 +1222,10 @@ xfs_dir2_sf_to_block(
 		 * Copy a real entry.
 		 */
 		dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
-		dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep));
+		dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
 		dep->namelen = sfep->namelen;
 		xfs_dir3_dirent_put_ftype(mp, dep,
-					xfs_dir3_sfe_get_ftype(mp, sfp, sfep));
+					  dp->d_ops->sf_get_ftype(sfep));
 		memcpy(dep->name, sfep->name, dep->namelen);
 		tagp = xfs_dir3_data_entry_tag_p(mp, dep);
 		*tagp = cpu_to_be16((char *)dep - (char *)hdr);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 80333055df34..2d2c8fb05541 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -136,7 +136,7 @@ xfs_dir2_sf_getdents(
 	 * Put .. entry unless we're starting past it.
 	 */
 	if (ctx->pos <= dotdot_offset) {
-		ino = xfs_dir2_sf_get_parent_ino(sfp);
+		ino = dp->d_ops->sf_get_parent_ino(sfp);
 		ctx->pos = dotdot_offset & 0x7fffffff;
 		if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
 			return 0;
@@ -157,8 +157,8 @@ xfs_dir2_sf_getdents(
 			continue;
 		}
 
-		ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
-		filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep);
+		ino = dp->d_ops->sf_get_ino(sfp, sfep);
+		filetype = dp->d_ops->sf_get_ftype(sfep);
 		ctx->pos = off & 0x7fffffff;
 		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
 			    xfs_dir3_get_dtype(mp, filetype)))
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 73881c9f40d6..ec0d39b5fa12 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -56,89 +56,6 @@ static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
 static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
 #endif /* XFS_BIG_INUMS */
 
-/*
- * Inode numbers in short-form directories can come in two versions,
- * either 4 bytes or 8 bytes wide.  These helpers deal with the
- * two forms transparently by looking at the headers i8count field.
- *
- * For 64-bit inode number the most significant byte must be zero.
- */
-static xfs_ino_t
-xfs_dir2_sf_get_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_dir2_inou_t		*from)
-{
-	if (hdr->i8count)
-		return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
-	else
-		return get_unaligned_be32(&from->i4.i);
-}
-
-static void
-xfs_dir2_sf_put_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_dir2_inou_t		*to,
-	xfs_ino_t		ino)
-{
-	ASSERT((ino & 0xff00000000000000ULL) == 0);
-
-	if (hdr->i8count)
-		put_unaligned_be64(ino, &to->i8.i);
-	else
-		put_unaligned_be32(ino, &to->i4.i);
-}
-
-xfs_ino_t
-xfs_dir2_sf_get_parent_ino(
-	struct xfs_dir2_sf_hdr	*hdr)
-{
-	return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
-}
-
-void
-xfs_dir2_sf_put_parent_ino(
-	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_ino_t		ino)
-{
-	xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
-}
-
-/*
- * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. If the entry stores a filetype value, then it
- * sits between the name and the inode number. Hence the inode numbers may only
- * be accessed through the helpers below.
- */
-static xfs_dir2_inou_t *
-xfs_dir3_sfe_inop(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	__uint8_t	*ptr = &sfep->name[sfep->namelen];
-	if (xfs_sb_version_hasftype(&mp->m_sb))
-		ptr++;
-	return (xfs_dir2_inou_t *)ptr;
-}
-
-xfs_ino_t
-xfs_dir3_sfe_get_ino(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep)
-{
-	return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep));
-}
-
-void
-xfs_dir3_sfe_put_ino(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_sf_hdr	*hdr,
-	struct xfs_dir2_sf_entry *sfep,
-	xfs_ino_t		ino)
-{
-	xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino);
-}
-
 /*
  * Given a block directory (dp/block), calculate its size as a shortform (sf)
  * directory and a header for the sf directory, if it will fit it the
@@ -226,7 +143,7 @@ xfs_dir2_block_sfsize(
 	 */
 	sfhp->count = count;
 	sfhp->i8count = i8count;
-	xfs_dir2_sf_put_parent_ino(sfhp, parent);
+	dp->d_ops->sf_put_parent_ino(sfhp, parent);
 	return size;
 }
 
@@ -321,7 +238,7 @@ xfs_dir2_block_to_sf(
 		else if (dep->namelen == 2 &&
 			 dep->name[0] == '.' && dep->name[1] == '.')
 			ASSERT(be64_to_cpu(dep->inumber) ==
-			       xfs_dir2_sf_get_parent_ino(sfp));
+			       dp->d_ops->sf_get_parent_ino(sfp));
 		/*
 		 * Normal entry, copy it into shortform.
 		 */
@@ -331,9 +248,9 @@ xfs_dir2_block_to_sf(
 				(xfs_dir2_data_aoff_t)
 				((char *)dep - (char *)hdr));
 			memcpy(sfep->name, dep->name, dep->namelen);
-			xfs_dir3_sfe_put_ino(mp, sfp, sfep,
-					     be64_to_cpu(dep->inumber));
-			xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+			dp->d_ops->sf_put_ino(sfp, sfep,
+					      be64_to_cpu(dep->inumber));
+			dp->d_ops->sf_put_ftype(sfep,
 					xfs_dir3_dirent_get_ftype(mp, dep));
 
 			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
@@ -496,8 +413,8 @@ xfs_dir2_sf_addname_easy(
 	sfep->namelen = args->namelen;
 	xfs_dir2_sf_put_offset(sfep, offset);
 	memcpy(sfep->name, args->name, sfep->namelen);
-	xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber);
-	xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype);
+	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+	dp->d_ops->sf_put_ftype(sfep, args->filetype);
 
 	/*
 	 * Update the header and inode.
@@ -591,8 +508,8 @@ xfs_dir2_sf_addname_hard(
 	sfep->namelen = args->namelen;
 	xfs_dir2_sf_put_offset(sfep, offset);
 	memcpy(sfep->name, args->name, sfep->namelen);
-	xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber);
-	xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype);
+	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+	dp->d_ops->sf_put_ftype(sfep, args->filetype);
 	sfp->count++;
 #if XFS_BIG_INUMS
 	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -713,20 +630,19 @@ xfs_dir2_sf_check(
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	offset = xfs_dir3_data_first_offset(mp);
-	ino = xfs_dir2_sf_get_parent_ino(sfp);
+	ino = dp->d_ops->sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
 	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
 	     i < sfp->count;
 	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
 		ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
-		ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
+		ino = dp->d_ops->sf_get_ino(sfp, sfep);
 		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
 		offset =
 			xfs_dir2_sf_get_offset(sfep) +
 			xfs_dir3_data_entsize(mp, sfep->namelen);
-		ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) <
-							XFS_DIR3_FT_MAX);
+		ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
 	}
 	ASSERT(i8count == sfp->i8count);
 	ASSERT(XFS_BIG_INUMS || i8count == 0);
@@ -782,7 +698,7 @@ xfs_dir2_sf_create(
 	/*
 	 * Now can put in the inode number, since i8count is set.
 	 */
-	xfs_dir2_sf_put_parent_ino(sfp, pino);
+	dp->d_ops->sf_put_parent_ino(sfp, pino);
 	sfp->count = 0;
 	dp->i_d.di_size = size;
 	xfs_dir2_sf_check(args);
@@ -837,7 +753,7 @@ xfs_dir2_sf_lookup(
 	 */
 	if (args->namelen == 2 &&
 	    args->name[0] == '.' && args->name[1] == '.') {
-		args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
+		args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
 		args->cmpresult = XFS_CMP_EXACT;
 		args->filetype = XFS_DIR3_FT_DIR;
 		return XFS_ERROR(EEXIST);
@@ -857,10 +773,8 @@ xfs_dir2_sf_lookup(
 								sfep->namelen);
 		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
 			args->cmpresult = cmp;
-			args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount,
-							     sfp, sfep);
-			args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount,
-								sfp, sfep);
+			args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+			args->filetype = dp->d_ops->sf_get_ftype(sfep);
 			if (cmp == XFS_CMP_EXACT)
 				return XFS_ERROR(EEXIST);
 			ci_sfep = sfep;
@@ -919,7 +833,7 @@ xfs_dir2_sf_removename(
 	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
 		if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
 								XFS_CMP_EXACT) {
-			ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) ==
+			ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
 			       args->inumber);
 			break;
 		}
@@ -1040,10 +954,10 @@ xfs_dir2_sf_replace(
 	if (args->namelen == 2 &&
 	    args->name[0] == '.' && args->name[1] == '.') {
 #if XFS_BIG_INUMS || defined(DEBUG)
-		ino = xfs_dir2_sf_get_parent_ino(sfp);
+		ino = dp->d_ops->sf_get_parent_ino(sfp);
 		ASSERT(args->inumber != ino);
 #endif
-		xfs_dir2_sf_put_parent_ino(sfp, args->inumber);
+		dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
 	}
 	/*
 	 * Normal entry, look for the name.
@@ -1054,14 +968,11 @@ xfs_dir2_sf_replace(
 			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
 								XFS_CMP_EXACT) {
 #if XFS_BIG_INUMS || defined(DEBUG)
-				ino = xfs_dir3_sfe_get_ino(dp->i_mount,
-							   sfp, sfep);
+				ino = dp->d_ops->sf_get_ino(sfp, sfep);
 				ASSERT(args->inumber != ino);
 #endif
-				xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep,
-						     args->inumber);
-				xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep,
-						       args->filetype);
+				dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+				dp->d_ops->sf_put_ftype(sfep, args->filetype);
 				break;
 			}
 		}
@@ -1164,7 +1075,7 @@ xfs_dir2_sf_toino4(
 	 */
 	sfp->count = oldsfp->count;
 	sfp->i8count = 0;
-	xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
 	/*
 	 * Copy the entries field by field.
 	 */
@@ -1176,10 +1087,9 @@ xfs_dir2_sf_toino4(
 		sfep->namelen = oldsfep->namelen;
 		sfep->offset = oldsfep->offset;
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
-		xfs_dir3_sfe_put_ino(mp, sfp, sfep,
-			xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
-		xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
-			xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
+		dp->d_ops->sf_put_ino(sfp, sfep,
+				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
 	}
 	/*
 	 * Clean up the inode.
@@ -1243,7 +1153,7 @@ xfs_dir2_sf_toino8(
 	 */
 	sfp->count = oldsfp->count;
 	sfp->i8count = 1;
-	xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+	dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
 	/*
 	 * Copy the entries field by field.
 	 */
@@ -1255,10 +1165,9 @@ xfs_dir2_sf_toino8(
 		sfep->namelen = oldsfep->namelen;
 		sfep->offset = oldsfep->offset;
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
-		xfs_dir3_sfe_put_ino(mp, sfp, sfep,
-			xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
-		xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
-			xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
+		dp->d_ops->sf_put_ino(sfp, sfep,
+				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+		dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
 	}
 	/*
 	 * Clean up the inode.
-- 
cgit v1.3


From 9d23fc8575de7012b8853bd6fefe10534665de2f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:48 +1100
Subject: xfs: vectorise directory data operations

Following from the initial patches to vectorise the shortform
directory encode/decode operations, convert half the data block
operations to use the vector. The rest will be done in a second
patch.

This further reduces the size of the built binary:

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2
 789293   96802    1096  887191   d8997 fs/xfs/xfs.o.p3

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_format.c    | 218 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_da_format.h    | 123 --------------------------
 fs/xfs/xfs_dir2.h         |  22 ++++-
 fs/xfs/xfs_dir2_block.c   |  54 ++++++------
 fs/xfs/xfs_dir2_data.c    |  38 +++++---
 fs/xfs/xfs_dir2_leaf.c    |  22 ++---
 fs/xfs/xfs_dir2_node.c    |  23 ++---
 fs/xfs/xfs_dir2_readdir.c |  14 +--
 fs/xfs/xfs_dir2_sf.c      |  20 ++---
 9 files changed, 329 insertions(+), 205 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index 62f55a0186ae..d0285fdfb6e7 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -28,7 +28,9 @@
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 
-
+/*
+ * Shortform directory ops
+ */
 static int
 xfs_dir2_sf_entsize(
 	struct xfs_dir2_sf_hdr	*hdr,
@@ -203,6 +205,184 @@ xfs_dir3_sfe_put_ino(
 			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
 }
 
+
+/*
+ * Directory data block operations
+ */
+static int
+__xfs_dir3_data_entsize(
+	bool	ftype,
+	int	n)
+{
+	int	size = offsetof(struct xfs_dir2_data_entry, name[0]);
+
+	size += n;
+	size += sizeof(xfs_dir2_data_off_t);
+	if (ftype)
+		size += sizeof(__uint8_t);
+	return roundup(size, XFS_DIR2_DATA_ALIGN);
+}
+
+static int
+xfs_dir2_data_entsize(
+	int			n)
+{
+	return __xfs_dir3_data_entsize(false, n);
+}
+static int
+xfs_dir3_data_entsize(
+	int			n)
+{
+	return __xfs_dir3_data_entsize(true, n);
+}
+
+static __uint8_t
+xfs_dir2_data_get_ftype(
+	struct xfs_dir2_data_entry *dep)
+{
+	return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_data_put_ftype(
+	struct xfs_dir2_data_entry *dep,
+	__uint8_t		ftype)
+{
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_data_get_ftype(
+	struct xfs_dir2_data_entry *dep)
+{
+	__uint8_t	ftype = dep->name[dep->namelen];
+
+	ASSERT(ftype < XFS_DIR3_FT_MAX);
+	if (ftype >= XFS_DIR3_FT_MAX)
+		return XFS_DIR3_FT_UNKNOWN;
+	return ftype;
+}
+
+static void
+xfs_dir3_data_put_ftype(
+	struct xfs_dir2_data_entry *dep,
+	__uint8_t		type)
+{
+	ASSERT(type < XFS_DIR3_FT_MAX);
+	ASSERT(dep->namelen != 0);
+
+	dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+	struct xfs_dir2_data_entry *dep)
+{
+	return (__be16 *)((char *)dep +
+		xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+	struct xfs_dir2_data_entry *dep)
+{
+	return (__be16 *)((char *)dep +
+		xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * Offsets of . and .. in data space (always block 0)
+ */
+static xfs_dir2_data_aoff_t
+xfs_dir2_data_dot_offset(void)
+{
+	return sizeof(struct xfs_dir2_data_hdr);
+}
+
+static xfs_dir2_data_aoff_t
+xfs_dir2_data_dotdot_offset(void)
+{
+	return xfs_dir2_data_dot_offset() + xfs_dir2_data_entsize(1);
+}
+
+static xfs_dir2_data_aoff_t
+xfs_dir2_data_first_offset(void)
+{
+	return xfs_dir2_data_dotdot_offset() + xfs_dir2_data_entsize(2);
+}
+
+static xfs_dir2_data_aoff_t
+xfs_dir3_data_dot_offset(void)
+{
+	return sizeof(struct xfs_dir3_data_hdr);
+}
+
+static xfs_dir2_data_aoff_t
+xfs_dir3_data_dotdot_offset(void)
+{
+	return xfs_dir3_data_dot_offset() + xfs_dir3_data_entsize(1);
+}
+
+static xfs_dir2_data_aoff_t
+xfs_dir3_data_first_offset(void)
+{
+	return xfs_dir3_data_dotdot_offset() + xfs_dir3_data_entsize(2);
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir2_data_dot_offset());
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir2_data_dotdot_offset());
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir2_data_first_offset());
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_dot_offset());
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_dotdot_offset());
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_first_offset());
+}
+
 const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_entsize = xfs_dir2_sf_entsize,
 	.sf_nextentry = xfs_dir2_sf_nextentry,
@@ -212,6 +392,18 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_put_ino = xfs_dir2_sfe_put_ino,
 	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
 	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir2_data_entsize,
+	.data_get_ftype = xfs_dir2_data_get_ftype,
+	.data_put_ftype = xfs_dir2_data_put_ftype,
+	.data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+
+	.data_dot_offset = xfs_dir2_data_dot_offset,
+	.data_dotdot_offset = xfs_dir2_data_dotdot_offset,
+	.data_first_offset = xfs_dir2_data_first_offset,
+	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir2_data_first_entry_p,
 };
 
 const struct xfs_dir_ops xfs_dir2_ftype_ops = {
@@ -223,6 +415,18 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.sf_put_ino = xfs_dir3_sfe_put_ino,
 	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
 	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir3_data_entsize,
+	.data_get_ftype = xfs_dir3_data_get_ftype,
+	.data_put_ftype = xfs_dir3_data_put_ftype,
+	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+
+	.data_dot_offset = xfs_dir2_data_dot_offset,
+	.data_dotdot_offset = xfs_dir2_data_dotdot_offset,
+	.data_first_offset = xfs_dir2_data_first_offset,
+	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir2_data_first_entry_p,
 };
 
 const struct xfs_dir_ops xfs_dir3_ops = {
@@ -234,4 +438,16 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.sf_put_ino = xfs_dir3_sfe_put_ino,
 	.sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
 	.sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+	.data_entsize = xfs_dir3_data_entsize,
+	.data_get_ftype = xfs_dir3_data_get_ftype,
+	.data_put_ftype = xfs_dir3_data_put_ftype,
+	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+
+	.data_dot_offset = xfs_dir3_data_dot_offset,
+	.data_dotdot_offset = xfs_dir3_data_dotdot_offset,
+	.data_first_offset = xfs_dir3_data_first_offset,
+	.data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+	.data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir3_data_first_entry_p,
 };
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index 68c2ad5ba54f..5b72dd2f8a69 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -455,72 +455,6 @@ typedef struct xfs_dir2_data_unused {
 	__be16			tag;		/* starting offset of us */
 } xfs_dir2_data_unused_t;
 
-/*
- * Size of a data entry.
- */
-static inline int
-__xfs_dir3_data_entsize(
-	bool	ftype,
-	int	n)
-{
-	int	size = offsetof(struct xfs_dir2_data_entry, name[0]);
-
-	size += n;
-	size += sizeof(xfs_dir2_data_off_t);
-	if (ftype)
-		size += sizeof(__uint8_t);
-	return roundup(size, XFS_DIR2_DATA_ALIGN);
-}
-static inline int
-xfs_dir3_data_entsize(
-	struct xfs_mount	*mp,
-	int			n)
-{
-	bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
-	return __xfs_dir3_data_entsize(ftype, n);
-}
-
-static inline __uint8_t
-xfs_dir3_dirent_get_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_entry *dep)
-{
-	if (xfs_sb_version_hasftype(&mp->m_sb)) {
-		__uint8_t	type = dep->name[dep->namelen];
-
-		ASSERT(type < XFS_DIR3_FT_MAX);
-		if (type < XFS_DIR3_FT_MAX)
-			return type;
-
-	}
-	return XFS_DIR3_FT_UNKNOWN;
-}
-
-static inline void
-xfs_dir3_dirent_put_ftype(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_entry *dep,
-	__uint8_t		type)
-{
-	ASSERT(type < XFS_DIR3_FT_MAX);
-	ASSERT(dep->namelen != 0);
-
-	if (xfs_sb_version_hasftype(&mp->m_sb))
-		dep->name[dep->namelen] = type;
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static inline __be16 *
-xfs_dir3_data_entry_tag_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_entry *dep)
-{
-	return (__be16 *)((char *)dep +
-		xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
-}
-
 /*
  * Pointer to a freespace's tag word.
  */
@@ -561,63 +495,6 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
 		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
 }
 
-/*
- * Offsets of . and .. in data space (always block 0)
- *
- * XXX: there is scope for significant optimisation of the logic here. Right
- * now we are checking for "dir3 format" over and over again. Ideally we should
- * only do it once for each operation.
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dot_offset(struct xfs_mount *mp)
-{
-	return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
-{
-	return xfs_dir3_data_dot_offset(mp) +
-		xfs_dir3_data_entsize(mp, 1);
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_first_offset(struct xfs_mount *mp)
-{
-	return xfs_dir3_data_dotdot_offset(mp) +
-		xfs_dir3_data_entsize(mp, 2);
-}
-
-/*
- * location of . and .. in data space (always block 0)
- */
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dot_offset(mp));
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(
-	struct xfs_mount	*mp,
-	struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_first_offset(mp));
-}
-
 /*
  * Leaf block structures.
  *
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 0e94b3e662af..c0a45923cce0 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -50,6 +50,22 @@ struct xfs_dir_ops {
 	xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
 	void	(*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
 				     xfs_ino_t ino);
+
+	int	(*data_entsize)(int len);
+	__uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+	void	(*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+				__uint8_t ftype);
+	__be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+
+	xfs_dir2_data_aoff_t (*data_dot_offset)(void);
+	xfs_dir2_data_aoff_t (*data_dotdot_offset)(void);
+	xfs_dir2_data_aoff_t (*data_first_offset)(void);
+	struct xfs_dir2_data_entry *
+		(*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
 };
 
 extern const struct xfs_dir_ops xfs_dir2_ops;
@@ -95,10 +111,10 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
 extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
 				struct xfs_buf *bp);
 
-extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
 		struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
-		struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_inode *dp,
+		struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
 extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
 		struct xfs_buf *bp);
 extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 960f3ab526f6..507ef6a7d1f9 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -280,6 +280,7 @@ out:
 static void
 xfs_dir2_block_compact(
 	struct xfs_trans		*tp,
+	struct xfs_inode		*dp,
 	struct xfs_buf			*bp,
 	struct xfs_dir2_data_hdr	*hdr,
 	struct xfs_dir2_block_tail	*btp,
@@ -323,7 +324,7 @@ xfs_dir2_block_compact(
 	 * This needs to happen before the next call to use_free.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
+		xfs_dir2_data_freescan(dp, hdr, needlog);
 }
 
 /*
@@ -369,7 +370,7 @@ xfs_dir2_block_addname(
 	if (error)
 		return error;
 
-	len = xfs_dir3_data_entsize(mp, args->namelen);
+	len = dp->d_ops->data_entsize(args->namelen);
 
 	/*
 	 * Set up pointers to parts of the block.
@@ -418,7 +419,7 @@ xfs_dir2_block_addname(
 	 * If need to compact the leaf entries, do it now.
 	 */
 	if (compact) {
-		xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
+		xfs_dir2_block_compact(tp, dp, bp, hdr, btp, blp, &needlog,
 				      &lfloghigh, &lfloglow);
 		/* recalculate blp post-compaction */
 		blp = xfs_dir2_block_leaf_p(btp);
@@ -468,7 +469,7 @@ xfs_dir2_block_addname(
 		 * This needs to happen before the next call to use_free.
 		 */
 		if (needscan) {
-			xfs_dir2_data_freescan(mp, hdr, &needlog);
+			xfs_dir2_data_freescan(dp, hdr, &needlog);
 			needscan = 0;
 		}
 		/*
@@ -549,18 +550,18 @@ xfs_dir2_block_addname(
 	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, args->namelen);
-	xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
-	tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
 	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
 	/*
 	 * Clean up the bestfree array and log the header, tail, and entry.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, bp);
 	xfs_dir2_block_log_tail(tp, bp);
-	xfs_dir2_data_log_entry(tp, bp, dep);
+	xfs_dir2_data_log_entry(tp, dp, bp, dep);
 	xfs_dir3_data_check(dp, bp);
 	return 0;
 }
@@ -642,7 +643,7 @@ xfs_dir2_block_lookup(
 	 * Fill in inode number, CI name if appropriate, release the block.
 	 */
 	args->inumber = be64_to_cpu(dep->inumber);
-	args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+	args->filetype = dp->d_ops->data_get_ftype(dep);
 	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
 	xfs_trans_brelse(args->trans, bp);
 	return XFS_ERROR(error);
@@ -801,7 +802,7 @@ xfs_dir2_block_removename(
 	needlog = needscan = 0;
 	xfs_dir2_data_make_free(tp, bp,
 		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-		xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
 	/*
 	 * Fix up the block tail.
 	 */
@@ -816,7 +817,7 @@ xfs_dir2_block_removename(
 	 * Fix up bestfree, log the header if necessary.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, bp);
 	xfs_dir3_data_check(dp, bp);
@@ -875,8 +876,8 @@ xfs_dir2_block_replace(
 	 * Change the inode number to the new value.
 	 */
 	dep->inumber = cpu_to_be64(args->inumber);
-	xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
-	xfs_dir2_data_log_entry(args->trans, bp, dep);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	xfs_dir2_data_log_entry(args->trans, dp, bp, dep);
 	xfs_dir3_data_check(dp, bp);
 	return 0;
 }
@@ -1023,7 +1024,7 @@ xfs_dir2_leaf_to_block(
 	 * Scan the bestfree if we need it and log the data block header.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
 	/*
@@ -1158,32 +1159,32 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Create entry for .
 	 */
-	dep = xfs_dir3_data_dot_entry_p(mp, hdr);
+	dep = dp->d_ops->data_dot_entry_p(hdr);
 	dep->inumber = cpu_to_be64(dp->i_ino);
 	dep->namelen = 1;
 	dep->name[0] = '.';
-	xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
-	tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
 	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(tp, bp, dep);
+	xfs_dir2_data_log_entry(tp, dp, bp, dep);
 	blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
 	blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
 				(char *)dep - (char *)hdr));
 	/*
 	 * Create entry for ..
 	 */
-	dep = xfs_dir3_data_dotdot_entry_p(mp, hdr);
+	dep = dp->d_ops->data_dotdot_entry_p(hdr);
 	dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
 	dep->namelen = 2;
 	dep->name[0] = dep->name[1] = '.';
-	xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
-	tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
 	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(tp, bp, dep);
+	xfs_dir2_data_log_entry(tp, dp, bp, dep);
 	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
 	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
 				(char *)dep - (char *)hdr));
-	offset = xfs_dir3_data_first_offset(mp);
+	offset = dp->d_ops->data_first_offset();
 	/*
 	 * Loop over existing entries, stuff them in.
 	 */
@@ -1224,12 +1225,11 @@ xfs_dir2_sf_to_block(
 		dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
 		dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
 		dep->namelen = sfep->namelen;
-		xfs_dir3_dirent_put_ftype(mp, dep,
-					  dp->d_ops->sf_get_ftype(sfep));
+		dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
 		memcpy(dep->name, sfep->name, dep->namelen);
-		tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+		tagp = dp->d_ops->data_entry_tag_p(dep);
 		*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-		xfs_dir2_data_log_entry(tp, bp, dep);
+		xfs_dir2_data_log_entry(tp, dp, bp, dep);
 		name.name = sfep->name;
 		name.len = sfep->namelen;
 		blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index ccfeb4d8376a..00006b3db054 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -62,12 +62,26 @@ __xfs_dir3_data_check(
 	char			*p;		/* current data position */
 	int			stale;		/* count of stale leaves */
 	struct xfs_name		name;
+	const struct xfs_dir_ops *ops;
 
 	mp = bp->b_target->bt_mount;
 	hdr = bp->b_addr;
 	bf = xfs_dir3_data_bestfree_p(hdr);
 	p = (char *)xfs_dir3_data_entry_p(hdr);
 
+	/*
+	 * We can be passed a null dp here from a verifier, so manually
+	 * configure the ops here in that case.
+	 */
+	if (dp)
+		ops = dp->d_ops;
+	else if (xfs_sb_version_hascrc(&mp->m_sb))
+		ops = &xfs_dir3_ops;
+	else if (xfs_sb_version_hasftype(&mp->m_sb))
+		ops = &xfs_dir2_ftype_ops;
+	else
+		ops = &xfs_dir2_ops;
+
 	switch (hdr->magic) {
 	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
 	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
@@ -146,10 +160,10 @@ __xfs_dir3_data_check(
 		XFS_WANT_CORRUPTED_RETURN(
 			!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
 		XFS_WANT_CORRUPTED_RETURN(
-			be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) ==
+			be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
 					       (char *)dep - (char *)hdr);
 		XFS_WANT_CORRUPTED_RETURN(
-			xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX);
+				ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
 		count++;
 		lastfree = 0;
 		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
@@ -167,7 +181,7 @@ __xfs_dir3_data_check(
 			}
 			XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
 		}
-		p += xfs_dir3_data_entsize(mp, dep->namelen);
+		p += ops->data_entsize(dep->namelen);
 	}
 	/*
 	 * Need to have seen all the entries and all the bestfree slots.
@@ -485,9 +499,9 @@ xfs_dir2_data_freeremove(
  */
 void
 xfs_dir2_data_freescan(
-	xfs_mount_t		*mp,		/* filesystem mount point */
-	xfs_dir2_data_hdr_t	*hdr,		/* data block header */
-	int			*loghead)	/* out: log data header */
+	struct xfs_inode	*dp,
+	struct xfs_dir2_data_hdr *hdr,
+	int			*loghead)
 {
 	xfs_dir2_block_tail_t	*btp;		/* block tail */
 	xfs_dir2_data_entry_t	*dep;		/* active data entry */
@@ -513,10 +527,10 @@ xfs_dir2_data_freescan(
 	p = (char *)xfs_dir3_data_entry_p(hdr);
 	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
 	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-		btp = xfs_dir2_block_tail_p(mp, hdr);
+		btp = xfs_dir2_block_tail_p(dp->i_mount, hdr);
 		endp = (char *)xfs_dir2_block_leaf_p(btp);
 	} else
-		endp = (char *)hdr + mp->m_dirblksize;
+		endp = (char *)hdr + dp->i_mount->m_dirblksize;
 	/*
 	 * Loop over the block's entries.
 	 */
@@ -537,8 +551,8 @@ xfs_dir2_data_freescan(
 		else {
 			dep = (xfs_dir2_data_entry_t *)p;
 			ASSERT((char *)dep - (char *)hdr ==
-			       be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)));
-			p += xfs_dir3_data_entsize(mp, dep->namelen);
+			       be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+			p += dp->d_ops->data_entsize(dep->namelen);
 		}
 	}
 }
@@ -625,11 +639,11 @@ xfs_dir3_data_init(
 void
 xfs_dir2_data_log_entry(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp,
 	xfs_dir2_data_entry_t	*dep)		/* data entry pointer */
 {
 	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-	struct xfs_mount	*mp = tp->t_mountp;
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -637,7 +651,7 @@ xfs_dir2_data_log_entry(
 	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 
 	xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
-		(uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) -
+		(uint)((char *)(dp->d_ops->data_entry_tag_p(dep) + 1) -
 		       (char *)hdr - 1));
 }
 
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 51fdc11a1e2c..dd195363ccf2 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -500,7 +500,7 @@ xfs_dir2_block_to_leaf(
 		hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
 
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	/*
 	 * Set up leaf tail and bests table.
 	 */
@@ -700,7 +700,7 @@ xfs_dir2_leaf_addname(
 	ents = xfs_dir3_leaf_ents_p(leaf);
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
-	length = xfs_dir3_data_entsize(mp, args->namelen);
+	length = dp->d_ops->data_entsize(args->namelen);
 
 	/*
 	 * See if there are any entries with the same hash value
@@ -901,20 +901,20 @@ xfs_dir2_leaf_addname(
 	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, dep->namelen);
-	xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
-	tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
 	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
 	/*
 	 * Need to scan fix up the bestfree table.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	/*
 	 * Need to log the data block's header.
 	 */
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
-	xfs_dir2_data_log_entry(tp, dbp, dep);
+	xfs_dir2_data_log_entry(tp, dp, dbp, dep);
 	/*
 	 * If the bests table needs to be changed, do it.
 	 * Log the change unless we've already done that.
@@ -1230,7 +1230,7 @@ xfs_dir2_leaf_lookup(
 	 * Return the found inode number & CI name if appropriate
 	 */
 	args->inumber = be64_to_cpu(dep->inumber);
-	args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep);
+	args->filetype = dp->d_ops->data_get_ftype(dep);
 	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
 	xfs_trans_brelse(tp, dbp);
 	xfs_trans_brelse(tp, lbp);
@@ -1433,7 +1433,7 @@ xfs_dir2_leaf_removename(
 	 */
 	xfs_dir2_data_make_free(tp, dbp,
 		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-		xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
 	/*
 	 * We just mark the leaf entry stale by putting a null in it.
 	 */
@@ -1449,7 +1449,7 @@ xfs_dir2_leaf_removename(
 	 * log the data block header if necessary.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
 	/*
@@ -1561,9 +1561,9 @@ xfs_dir2_leaf_replace(
 	 * Put the new inode number in, log it.
 	 */
 	dep->inumber = cpu_to_be64(args->inumber);
-	xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
 	tp = args->trans;
-	xfs_dir2_data_log_entry(tp, dbp, dep);
+	xfs_dir2_data_log_entry(tp, dp, dbp, dep);
 	xfs_dir3_leaf_check(dp->i_mount, lbp);
 	xfs_trans_brelse(tp, lbp);
 	return 0;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index b8381646b8af..a1d133981c14 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -604,7 +604,7 @@ xfs_dir2_leafn_lookup_for_addname(
 		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
 		       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
 	}
-	length = xfs_dir3_data_entsize(mp, args->namelen);
+	length = dp->d_ops->data_entsize(args->namelen);
 	/*
 	 * Loop over leaf entries with the right hash value.
 	 */
@@ -815,7 +815,7 @@ xfs_dir2_leafn_lookup_for_entry(
 				xfs_trans_brelse(tp, state->extrablk.bp);
 			args->cmpresult = cmp;
 			args->inumber = be64_to_cpu(dep->inumber);
-			args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+			args->filetype = dp->d_ops->data_get_ftype(dep);
 			*indexp = index;
 			state->extravalid = 1;
 			state->extrablk.bp = curbp;
@@ -1259,13 +1259,13 @@ xfs_dir2_leafn_remove(
 	longest = be16_to_cpu(bf[0].length);
 	needlog = needscan = 0;
 	xfs_dir2_data_make_free(tp, dbp, off,
-		xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
 	/*
 	 * Rescan the data block freespaces for bestfree.
 	 * Log the data block header if needed.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
 	xfs_dir3_data_check(dp, dbp);
@@ -1711,7 +1711,7 @@ xfs_dir2_node_addname_int(
 	dp = args->dp;
 	mp = dp->i_mount;
 	tp = args->trans;
-	length = xfs_dir3_data_entsize(mp, args->namelen);
+	length = dp->d_ops->data_entsize(args->namelen);
 	/*
 	 * If we came in with a freespace block that means that lookup
 	 * found an entry with our hash value.  This is the freespace
@@ -2007,15 +2007,15 @@ xfs_dir2_node_addname_int(
 	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, dep->namelen);
-	xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
-	tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
 	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
-	xfs_dir2_data_log_entry(tp, dbp, dep);
+	xfs_dir2_data_log_entry(tp, dp, dbp, dep);
 	/*
 	 * Rescan the block for bestfree if needed.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, hdr, &needlog);
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	/*
 	 * Log the data block header if needed.
 	 */
@@ -2228,8 +2228,9 @@ xfs_dir2_node_replace(
 		 * Fill in the new inode number and log the entry.
 		 */
 		dep->inumber = cpu_to_be64(inum);
-		xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype);
-		xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
+		args->dp->d_ops->data_put_ftype(dep, args->filetype);
+		xfs_dir2_data_log_entry(args->trans, args->dp,
+					state->extrablk.bp, dep);
 		rval = 0;
 	}
 	/*
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 2d2c8fb05541..b99aa7d48e90 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -119,9 +119,9 @@ xfs_dir2_sf_getdents(
 	 * mp->m_dirdatablk.
 	 */
 	dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-					     xfs_dir3_data_dot_offset(mp));
+						dp->d_ops->data_dot_offset());
 	dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						xfs_dir3_data_dotdot_offset(mp));
+						dp->d_ops->data_dotdot_offset());
 
 	/*
 	 * Put . entry unless we're starting past it.
@@ -237,7 +237,7 @@ xfs_dir2_block_getdents(
 		/*
 		 * Bump pointer for the next iteration.
 		 */
-		ptr += xfs_dir3_data_entsize(mp, dep->namelen);
+		ptr += dp->d_ops->data_entsize(dep->namelen);
 		/*
 		 * The entry is before the desired starting point, skip it.
 		 */
@@ -248,7 +248,7 @@ xfs_dir2_block_getdents(
 					    (char *)dep - (char *)hdr);
 
 		ctx->pos = cook & 0x7fffffff;
-		filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+		filetype = dp->d_ops->data_get_ftype(dep);
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
@@ -601,7 +601,7 @@ xfs_dir2_leaf_getdents(
 					}
 					dep = (xfs_dir2_data_entry_t *)ptr;
 					length =
-					   xfs_dir3_data_entsize(mp, dep->namelen);
+					   dp->d_ops->data_entsize(dep->namelen);
 					ptr += length;
 				}
 				/*
@@ -632,8 +632,8 @@ xfs_dir2_leaf_getdents(
 		}
 
 		dep = (xfs_dir2_data_entry_t *)ptr;
-		length = xfs_dir3_data_entsize(mp, dep->namelen);
-		filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+		length = dp->d_ops->data_entsize(dep->namelen);
+		filetype = dp->d_ops->data_get_ftype(dep);
 
 		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
 		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index ec0d39b5fa12..953e19479151 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -251,11 +251,11 @@ xfs_dir2_block_to_sf(
 			dp->d_ops->sf_put_ino(sfp, sfep,
 					      be64_to_cpu(dep->inumber));
 			dp->d_ops->sf_put_ftype(sfep,
-					xfs_dir3_dirent_get_ftype(mp, dep));
+					dp->d_ops->data_get_ftype(dep));
 
 			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 		}
-		ptr += xfs_dir3_data_entsize(mp, dep->namelen);
+		ptr += dp->d_ops->data_entsize(dep->namelen);
 	}
 	ASSERT((char *)sfep - (char *)sfp == size);
 	xfs_dir2_sf_check(args);
@@ -473,12 +473,12 @@ xfs_dir2_sf_addname_hard(
 	 * to insert the new entry.
 	 * If it's going to end up at the end then oldsfep will point there.
 	 */
-	for (offset = xfs_dir3_data_first_offset(mp),
+	for (offset = dp->d_ops->data_first_offset(),
 	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
-	      add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
+	      add_datasize = dp->d_ops->data_entsize(args->namelen),
 	      eof = (char *)oldsfep == &buf[old_isize];
 	     !eof;
-	     offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen),
+	     offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
 	      oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
 	      eof = (char *)oldsfep == &buf[old_isize]) {
 		new_offset = xfs_dir2_sf_get_offset(oldsfep);
@@ -555,8 +555,8 @@ xfs_dir2_sf_addname_pick(
 	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	size = xfs_dir3_data_entsize(mp, args->namelen);
-	offset = xfs_dir3_data_first_offset(mp);
+	size = dp->d_ops->data_entsize(args->namelen);
+	offset = dp->d_ops->data_first_offset();
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	holefit = 0;
 	/*
@@ -568,7 +568,7 @@ xfs_dir2_sf_addname_pick(
 		if (!holefit)
 			holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
 		offset = xfs_dir2_sf_get_offset(sfep) +
-			 xfs_dir3_data_entsize(mp, sfep->namelen);
+			 dp->d_ops->data_entsize(sfep->namelen);
 		sfep = dp->d_ops->sf_nextentry(sfp, sfep);
 	}
 	/*
@@ -629,7 +629,7 @@ xfs_dir2_sf_check(
 	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	offset = xfs_dir3_data_first_offset(mp);
+	offset = dp->d_ops->data_first_offset();
 	ino = dp->d_ops->sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
@@ -641,7 +641,7 @@ xfs_dir2_sf_check(
 		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
 		offset =
 			xfs_dir2_sf_get_offset(sfep) +
-			xfs_dir3_data_entsize(mp, sfep->namelen);
+			dp->d_ops->data_entsize(sfep->namelen);
 		ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
 	}
 	ASSERT(i8count == sfp->i8count);
-- 
cgit v1.3


From 2ca9877410594dd321621efc9e1cbcd1ca24d0f3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:49 +1100
Subject: xfs: vectorise directory data operations part 2

Convert the rest of the directory data block encode/decode
operations to vector format.

This further reduces the size of the built binary:

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2
 789293   96802    1096  887191   d8997 fs/xfs/xfs.o.p3
 789005   96802    1096  886903   d8997 fs/xfs/xfs.o.p4

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_format.c    |  68 +++++++++++++++++++++++++++++
 fs/xfs/xfs_da_format.h    |  41 ------------------
 fs/xfs/xfs_dir2.h         |  22 +++++++---
 fs/xfs/xfs_dir2_block.c   |  33 ++++++++-------
 fs/xfs/xfs_dir2_data.c    | 106 +++++++++++++++++++++++++---------------------
 fs/xfs/xfs_dir2_leaf.c    |  26 ++++++------
 fs/xfs/xfs_dir2_node.c    |  16 +++----
 fs/xfs/xfs_dir2_priv.h    |   3 +-
 fs/xfs/xfs_dir2_readdir.c |   6 +--
 fs/xfs/xfs_dir2_sf.c      |   2 +-
 10 files changed, 186 insertions(+), 137 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index d0285fdfb6e7..1e92a241fd52 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -383,6 +383,58 @@ xfs_dir3_data_first_entry_p(
 		((char *)hdr + xfs_dir3_data_first_offset());
 }
 
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return hdr->bestfree;
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+}
+
+static size_t
+xfs_dir2_data_entry_offset(void)
+{
+	return sizeof(struct xfs_dir2_data_hdr);
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir2_data_entry_offset());
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + xfs_dir2_data_entry_offset());
+}
+
+static size_t
+xfs_dir3_data_entry_offset(void)
+{
+	return sizeof(struct xfs_dir3_data_hdr);
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_entry_offset());
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + xfs_dir3_data_entry_offset());
+}
+
 const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_entsize = xfs_dir2_sf_entsize,
 	.sf_nextentry = xfs_dir2_sf_nextentry,
@@ -397,13 +449,19 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.data_get_ftype = xfs_dir2_data_get_ftype,
 	.data_put_ftype = xfs_dir2_data_put_ftype,
 	.data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir2_data_bestfree_p,
 
 	.data_dot_offset = xfs_dir2_data_dot_offset,
 	.data_dotdot_offset = xfs_dir2_data_dotdot_offset,
 	.data_first_offset = xfs_dir2_data_first_offset,
+	.data_entry_offset = xfs_dir2_data_entry_offset,
+
 	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
 	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
 	.data_first_entry_p = xfs_dir2_data_first_entry_p,
+	.data_entry_p = xfs_dir2_data_entry_p,
+	.data_unused_p = xfs_dir2_data_unused_p,
+
 };
 
 const struct xfs_dir_ops xfs_dir2_ftype_ops = {
@@ -420,13 +478,18 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.data_get_ftype = xfs_dir3_data_get_ftype,
 	.data_put_ftype = xfs_dir3_data_put_ftype,
 	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir2_data_bestfree_p,
 
 	.data_dot_offset = xfs_dir2_data_dot_offset,
 	.data_dotdot_offset = xfs_dir2_data_dotdot_offset,
 	.data_first_offset = xfs_dir2_data_first_offset,
+	.data_entry_offset = xfs_dir2_data_entry_offset,
+
 	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
 	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
 	.data_first_entry_p = xfs_dir2_data_first_entry_p,
+	.data_entry_p = xfs_dir2_data_entry_p,
+	.data_unused_p = xfs_dir2_data_unused_p,
 };
 
 const struct xfs_dir_ops xfs_dir3_ops = {
@@ -443,11 +506,16 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.data_get_ftype = xfs_dir3_data_get_ftype,
 	.data_put_ftype = xfs_dir3_data_put_ftype,
 	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+	.data_bestfree_p = xfs_dir3_data_bestfree_p,
 
 	.data_dot_offset = xfs_dir3_data_dot_offset,
 	.data_dotdot_offset = xfs_dir3_data_dotdot_offset,
 	.data_first_offset = xfs_dir3_data_first_offset,
+	.data_entry_offset = xfs_dir3_data_entry_offset,
+
 	.data_dot_entry_p = xfs_dir3_data_dot_entry_p,
 	.data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
 	.data_first_entry_p = xfs_dir3_data_first_entry_p,
+	.data_entry_p = xfs_dir3_data_entry_p,
+	.data_unused_p = xfs_dir3_data_unused_p,
 };
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index 5b72dd2f8a69..aeca0038a013 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -412,17 +412,6 @@ struct xfs_dir3_data_hdr {
 
 #define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
 
-static inline struct xfs_dir2_data_free *
-xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
-	if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
-		struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr;
-		return hdr3->best_free;
-	}
-	return hdr->bestfree;
-}
-
 /*
  * Active entry in a data block.
  *
@@ -465,36 +454,6 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
 			be16_to_cpu(dup->length) - sizeof(__be16));
 }
 
-static inline size_t
-xfs_dir3_data_hdr_size(bool dir3)
-{
-	if (dir3)
-		return sizeof(struct xfs_dir3_data_hdr);
-	return sizeof(struct xfs_dir2_data_hdr);
-}
-
-static inline size_t
-xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr)
-{
-	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-	return xfs_dir3_data_hdr_size(dir3);
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
-}
-
-static inline struct xfs_dir2_data_unused *
-xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
-	return (struct xfs_dir2_data_unused *)
-		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
-}
-
 /*
  * Leaf block structures.
  *
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index c0a45923cce0..076010a1376a 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -56,16 +56,24 @@ struct xfs_dir_ops {
 	void	(*data_put_ftype)(struct xfs_dir2_data_entry *dep,
 				__uint8_t ftype);
 	__be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+	struct xfs_dir2_data_free *
+		(*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
 
 	xfs_dir2_data_aoff_t (*data_dot_offset)(void);
 	xfs_dir2_data_aoff_t (*data_dotdot_offset)(void);
 	xfs_dir2_data_aoff_t (*data_first_offset)(void);
+	size_t	(*data_entry_offset)(void);
+
 	struct xfs_dir2_data_entry *
 		(*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
 	struct xfs_dir2_data_entry *
 		(*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
 	struct xfs_dir2_data_entry *
 		(*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_unused *
+		(*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
 };
 
 extern const struct xfs_dir_ops xfs_dir2_ops;
@@ -115,19 +123,21 @@ extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
 		struct xfs_dir2_data_hdr *hdr, int *loghead);
 extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_inode *dp,
 		struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
+extern void xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_inode *dp,
 		struct xfs_buf *bp);
 extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
 		struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
+extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_inode *dp,
+		struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_inode *dp,
+		struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
 		xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
 		int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
-		struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
-		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
-		struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup);
+		struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+		struct xfs_dir2_data_unused *dup);
 
 extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
 extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 507ef6a7d1f9..f92ca5699342 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -168,6 +168,7 @@ xfs_dir3_block_init(
 
 static void
 xfs_dir2_block_need_space(
+	struct xfs_inode		*dp,
 	struct xfs_dir2_data_hdr	*hdr,
 	struct xfs_dir2_block_tail	*btp,
 	struct xfs_dir2_leaf_entry	*blp,
@@ -183,7 +184,7 @@ xfs_dir2_block_need_space(
 	struct xfs_dir2_data_unused	*enddup = NULL;
 
 	*compact = 0;
-	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf = dp->d_ops->data_bestfree_p(hdr);
 
 	/*
 	 * If there are stale entries we'll use one for the leaf.
@@ -313,7 +314,7 @@ xfs_dir2_block_compact(
 	*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
 	*lfloghigh -= be32_to_cpu(btp->stale) - 1;
 	be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
-	xfs_dir2_data_make_free(tp, bp,
+	xfs_dir2_data_make_free(tp, dp, bp,
 		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
 		(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
 		needlog, &needscan);
@@ -383,7 +384,7 @@ xfs_dir2_block_addname(
 	 * Find out if we can reuse stale entries or whether we need extra
 	 * space for entry and new leaf.
 	 */
-	xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
+	xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
 				  &enddup, &compact, len);
 
 	/*
@@ -454,7 +455,7 @@ xfs_dir2_block_addname(
 		/*
 		 * Mark the space needed for the new leaf entry, now in use.
 		 */
-		xfs_dir2_data_use_free(tp, bp, enddup,
+		xfs_dir2_data_use_free(tp, dp, bp, enddup,
 			(xfs_dir2_data_aoff_t)
 			((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
 			 sizeof(*blp)),
@@ -541,7 +542,7 @@ xfs_dir2_block_addname(
 	/*
 	 * Mark space for the data entry used.
 	 */
-	xfs_dir2_data_use_free(tp, bp, dup,
+	xfs_dir2_data_use_free(tp, dp, bp, dup,
 		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
 		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
 	/*
@@ -559,7 +560,7 @@ xfs_dir2_block_addname(
 	if (needscan)
 		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
-		xfs_dir2_data_log_header(tp, bp);
+		xfs_dir2_data_log_header(tp, dp, bp);
 	xfs_dir2_block_log_tail(tp, bp);
 	xfs_dir2_data_log_entry(tp, dp, bp, dep);
 	xfs_dir3_data_check(dp, bp);
@@ -800,7 +801,7 @@ xfs_dir2_block_removename(
 	 * Mark the data entry's space free.
 	 */
 	needlog = needscan = 0;
-	xfs_dir2_data_make_free(tp, bp,
+	xfs_dir2_data_make_free(tp, dp, bp,
 		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
 		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
 	/*
@@ -819,7 +820,7 @@ xfs_dir2_block_removename(
 	if (needscan)
 		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
-		xfs_dir2_data_log_header(tp, bp);
+		xfs_dir2_data_log_header(tp, dp, bp);
 	xfs_dir3_data_check(dp, bp);
 	/*
 	 * See if the size as a shortform is good enough.
@@ -950,7 +951,7 @@ xfs_dir2_leaf_to_block(
 	while (dp->i_d.di_size > mp->m_dirblksize) {
 		int hdrsz;
 
-		hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
+		hdrsz = dp->d_ops->data_entry_offset();
 		bestsp = xfs_dir2_leaf_bests_p(ltp);
 		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
 					    mp->m_dirblksize - hdrsz) {
@@ -1000,7 +1001,7 @@ xfs_dir2_leaf_to_block(
 	/*
 	 * Use up the space at the end of the block (blp/btp).
 	 */
-	xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+	xfs_dir2_data_use_free(tp, dp, dbp, dup, mp->m_dirblksize - size, size,
 		&needlog, &needscan);
 	/*
 	 * Initialize the block tail.
@@ -1026,7 +1027,7 @@ xfs_dir2_leaf_to_block(
 	if (needscan)
 		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
-		xfs_dir2_data_log_header(tp, dbp);
+		xfs_dir2_data_log_header(tp, dp, dbp);
 	/*
 	 * Pitch the old leaf block.
 	 */
@@ -1137,9 +1138,9 @@ xfs_dir2_sf_to_block(
 	 * The whole thing is initialized to free by the init routine.
 	 * Say we're using the leaf and tail area.
 	 */
-	dup = xfs_dir3_data_unused_p(hdr);
+	dup = dp->d_ops->data_unused_p(hdr);
 	needlog = needscan = 0;
-	xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
+	xfs_dir2_data_use_free(tp, dp, bp, dup, mp->m_dirblksize - i, i, &needlog,
 		&needscan);
 	ASSERT(needscan == 0);
 	/*
@@ -1153,7 +1154,7 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Remove the freespace, we'll manage it.
 	 */
-	xfs_dir2_data_use_free(tp, bp, dup,
+	xfs_dir2_data_use_free(tp, dp, bp, dup,
 		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
 		be16_to_cpu(dup->length), &needlog, &needscan);
 	/*
@@ -1215,7 +1216,9 @@ xfs_dir2_sf_to_block(
 			*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
 				((char *)dup - (char *)hdr));
 			xfs_dir2_data_log_unused(tp, bp, dup);
-			xfs_dir2_data_freeinsert(hdr, dup, &dummy);
+			xfs_dir2_data_freeinsert(hdr,
+						 dp->d_ops->data_bestfree_p(hdr),
+						 dup, &dummy);
 			offset += be16_to_cpu(dup->length);
 			continue;
 		}
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 00006b3db054..13fa0d6b4ff7 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -66,8 +66,6 @@ __xfs_dir3_data_check(
 
 	mp = bp->b_target->bt_mount;
 	hdr = bp->b_addr;
-	bf = xfs_dir3_data_bestfree_p(hdr);
-	p = (char *)xfs_dir3_data_entry_p(hdr);
 
 	/*
 	 * We can be passed a null dp here from a verifier, so manually
@@ -97,6 +95,8 @@ __xfs_dir3_data_check(
 		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
 		return EFSCORRUPTED;
 	}
+	bf = ops->data_bestfree_p(hdr);
+	p = (char *)ops->data_entry_p(hdr);
 
 	count = lastfree = freeseen = 0;
 	/*
@@ -134,7 +134,7 @@ __xfs_dir3_data_check(
 			XFS_WANT_CORRUPTED_RETURN(
 				be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
 					       (char *)dup - (char *)hdr);
-			dfp = xfs_dir2_data_freefind(hdr, dup);
+			dfp = xfs_dir2_data_freefind(hdr, bf, dup);
 			if (dfp) {
 				i = (int)(dfp - bf);
 				XFS_WANT_CORRUPTED_RETURN(
@@ -340,19 +340,18 @@ xfs_dir3_data_readahead(
  */
 xfs_dir2_data_free_t *
 xfs_dir2_data_freefind(
-	xfs_dir2_data_hdr_t	*hdr,		/* data block */
-	xfs_dir2_data_unused_t	*dup)		/* data unused entry */
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
+	struct xfs_dir2_data_unused *dup)	/* unused space */
 {
 	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
 	xfs_dir2_data_aoff_t	off;		/* offset value needed */
-	struct xfs_dir2_data_free *bf;
 #ifdef DEBUG
 	int			matched;	/* matched the value */
 	int			seenzero;	/* saw a 0 bestfree entry */
 #endif
 
 	off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
-	bf = xfs_dir3_data_bestfree_p(hdr);
 
 #ifdef DEBUG
 	/*
@@ -412,11 +411,11 @@ xfs_dir2_data_freefind(
  */
 xfs_dir2_data_free_t *				/* entry inserted */
 xfs_dir2_data_freeinsert(
-	xfs_dir2_data_hdr_t	*hdr,		/* data block pointer */
-	xfs_dir2_data_unused_t	*dup,		/* unused space */
+	struct xfs_dir2_data_hdr *hdr,		/* data block pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree table pointer */
+	struct xfs_dir2_data_unused *dup,	/* unused space */
 	int			*loghead)	/* log the data header (out) */
 {
-	xfs_dir2_data_free_t	*dfp;		/* bestfree table pointer */
 	xfs_dir2_data_free_t	new;		/* new bestfree entry */
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
@@ -424,7 +423,6 @@ xfs_dir2_data_freeinsert(
 	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 
-	dfp = xfs_dir3_data_bestfree_p(hdr);
 	new.length = dup->length;
 	new.offset = cpu_to_be16((char *)dup - (char *)hdr);
 
@@ -457,11 +455,11 @@ xfs_dir2_data_freeinsert(
  */
 STATIC void
 xfs_dir2_data_freeremove(
-	xfs_dir2_data_hdr_t	*hdr,		/* data block header */
-	xfs_dir2_data_free_t	*dfp,		/* bestfree entry pointer */
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree entry pointer */
 	int			*loghead)	/* out: log data header */
 {
-	struct xfs_dir2_data_free *bf;
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
@@ -471,7 +469,6 @@ xfs_dir2_data_freeremove(
 	/*
 	 * It's the first entry, slide the next 2 up.
 	 */
-	bf = xfs_dir3_data_bestfree_p(hdr);
 	if (dfp == &bf[0]) {
 		bf[0] = bf[1];
 		bf[1] = bf[2];
@@ -518,13 +515,13 @@ xfs_dir2_data_freescan(
 	/*
 	 * Start by clearing the table.
 	 */
-	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf = dp->d_ops->data_bestfree_p(hdr);
 	memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
 	*loghead = 1;
 	/*
 	 * Set up pointers.
 	 */
-	p = (char *)xfs_dir3_data_entry_p(hdr);
+	p = (char *)dp->d_ops->data_entry_p(hdr);
 	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
 	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
 		btp = xfs_dir2_block_tail_p(dp->i_mount, hdr);
@@ -542,7 +539,7 @@ xfs_dir2_data_freescan(
 		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
 			ASSERT((char *)dup - (char *)hdr ==
 			       be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
-			xfs_dir2_data_freeinsert(hdr, dup, loghead);
+			xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
 			p += be16_to_cpu(dup->length);
 		}
 		/*
@@ -607,8 +604,8 @@ xfs_dir3_data_init(
 	} else
 		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
 
-	bf = xfs_dir3_data_bestfree_p(hdr);
-	bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr));
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset());
 	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
 		bf[i].length = 0;
 		bf[i].offset = 0;
@@ -617,17 +614,17 @@ xfs_dir3_data_init(
 	/*
 	 * Set up an unused entry for the block's body.
 	 */
-	dup = xfs_dir3_data_unused_p(hdr);
+	dup = dp->d_ops->data_unused_p(hdr);
 	dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
 
-	t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr);
+	t = mp->m_dirblksize - (uint)dp->d_ops->data_entry_offset();
 	bf[0].length = cpu_to_be16(t);
 	dup->length = cpu_to_be16(t);
 	*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
 	/*
 	 * Log it and return it.
 	 */
-	xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_data_log_header(tp, dp, bp);
 	xfs_dir2_data_log_unused(tp, bp, dup);
 	*bpp = bp;
 	return 0;
@@ -661,16 +658,19 @@ xfs_dir2_data_log_entry(
 void
 xfs_dir2_data_log_header(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp)
 {
-	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+#ifdef DEBUG
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
 
-	xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1);
+	xfs_trans_log_buf(tp, bp, 0, dp->d_ops->data_entry_offset() - 1);
 }
 
 /*
@@ -711,6 +711,7 @@ xfs_dir2_data_log_unused(
 void
 xfs_dir2_data_make_free(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp,
 	xfs_dir2_data_aoff_t	offset,		/* starting byte offset */
 	xfs_dir2_data_aoff_t	len,		/* length in bytes */
@@ -748,7 +749,7 @@ xfs_dir2_data_make_free(
 	 * If this isn't the start of the block, then back up to
 	 * the previous entry and see if it's free.
 	 */
-	if (offset > xfs_dir3_data_entry_offset(hdr)) {
+	if (offset > dp->d_ops->data_entry_offset()) {
 		__be16			*tagp;	/* tag just before us */
 
 		tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -774,15 +775,15 @@ xfs_dir2_data_make_free(
 	 * Previous and following entries are both free,
 	 * merge everything into a single free entry.
 	 */
-	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf = dp->d_ops->data_bestfree_p(hdr);
 	if (prevdup && postdup) {
 		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
 
 		/*
 		 * See if prevdup and/or postdup are in bestfree table.
 		 */
-		dfp = xfs_dir2_data_freefind(hdr, prevdup);
-		dfp2 = xfs_dir2_data_freefind(hdr, postdup);
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+		dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
 		/*
 		 * We need a rescan unless there are exactly 2 free entries
 		 * namely our two.  Then we know what's happening, otherwise
@@ -810,12 +811,13 @@ xfs_dir2_data_make_free(
 				ASSERT(dfp2 == dfp);
 				dfp2 = &bf[1];
 			}
-			xfs_dir2_data_freeremove(hdr, dfp2, needlogp);
-			xfs_dir2_data_freeremove(hdr, dfp, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
 			/*
 			 * Now insert the new entry.
 			 */
-			dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+						       needlogp);
 			ASSERT(dfp == &bf[0]);
 			ASSERT(dfp->length == prevdup->length);
 			ASSERT(!dfp[1].length);
@@ -826,7 +828,7 @@ xfs_dir2_data_make_free(
 	 * The entry before us is free, merge with it.
 	 */
 	else if (prevdup) {
-		dfp = xfs_dir2_data_freefind(hdr, prevdup);
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
 		be16_add_cpu(&prevdup->length, len);
 		*xfs_dir2_data_unused_tag_p(prevdup) =
 			cpu_to_be16((char *)prevdup - (char *)hdr);
@@ -837,8 +839,8 @@ xfs_dir2_data_make_free(
 		 * the old one and add the new one.
 		 */
 		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, dfp, needlogp);
-			xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
 		}
 		/*
 		 * Otherwise we need a scan if the new entry is big enough.
@@ -852,7 +854,7 @@ xfs_dir2_data_make_free(
 	 * The following entry is free, merge with it.
 	 */
 	else if (postdup) {
-		dfp = xfs_dir2_data_freefind(hdr, postdup);
+		dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
 		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
 		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
 		newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
@@ -865,8 +867,8 @@ xfs_dir2_data_make_free(
 		 * the old one and add the new one.
 		 */
 		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, dfp, needlogp);
-			xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
 		}
 		/*
 		 * Otherwise we need a scan if the new entry is big enough.
@@ -886,7 +888,7 @@ xfs_dir2_data_make_free(
 		*xfs_dir2_data_unused_tag_p(newdup) =
 			cpu_to_be16((char *)newdup - (char *)hdr);
 		xfs_dir2_data_log_unused(tp, bp, newdup);
-		xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+		xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
 	}
 	*needscanp = needscan;
 }
@@ -897,6 +899,7 @@ xfs_dir2_data_make_free(
 void
 xfs_dir2_data_use_free(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp,
 	xfs_dir2_data_unused_t	*dup,		/* unused entry */
 	xfs_dir2_data_aoff_t	offset,		/* starting offset to use */
@@ -926,9 +929,9 @@ xfs_dir2_data_use_free(
 	/*
 	 * Look up the entry in the bestfree table.
 	 */
-	dfp = xfs_dir2_data_freefind(hdr, dup);
 	oldlen = be16_to_cpu(dup->length);
-	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	dfp = xfs_dir2_data_freefind(hdr, bf, dup);
 	ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
 	/*
 	 * Check for alignment with front and back of the entry.
@@ -945,7 +948,8 @@ xfs_dir2_data_use_free(
 		if (dfp) {
 			needscan = (bf[2].offset != 0);
 			if (!needscan)
-				xfs_dir2_data_freeremove(hdr, dfp, needlogp);
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
 		}
 	}
 	/*
@@ -963,8 +967,9 @@ xfs_dir2_data_use_free(
 		 * If it was in the table, remove it and add the new one.
 		 */
 		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, dfp, needlogp);
-			dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
 			ASSERT(dfp != NULL);
 			ASSERT(dfp->length == newdup->length);
 			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
@@ -990,8 +995,9 @@ xfs_dir2_data_use_free(
 		 * If it was in the table, remove it and add the new one.
 		 */
 		if (dfp) {
-			xfs_dir2_data_freeremove(hdr, dfp, needlogp);
-			dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
 			ASSERT(dfp != NULL);
 			ASSERT(dfp->length == newdup->length);
 			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
@@ -1030,9 +1036,11 @@ xfs_dir2_data_use_free(
 		if (dfp) {
 			needscan = (bf[2].length != 0);
 			if (!needscan) {
-				xfs_dir2_data_freeremove(hdr, dfp, needlogp);
-				xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
-				xfs_dir2_data_freeinsert(hdr, newdup2,
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup2,
 							 needlogp);
 			}
 		}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index dd195363ccf2..f35b3942c1a6 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -460,7 +460,7 @@ xfs_dir2_block_to_leaf(
 	xfs_dir3_data_check(dp, dbp);
 	btp = xfs_dir2_block_tail_p(mp, hdr);
 	blp = xfs_dir2_block_leaf_p(btp);
-	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf = dp->d_ops->data_bestfree_p(hdr);
 	ents = xfs_dir3_leaf_ents_p(leaf);
 
 	/*
@@ -484,7 +484,7 @@ xfs_dir2_block_to_leaf(
 	 * Make the space formerly occupied by the leaf entries and block
 	 * tail be free.
 	 */
-	xfs_dir2_data_make_free(tp, dbp,
+	xfs_dir2_data_make_free(tp, dp, dbp,
 		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
 		(xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize -
 				       (char *)blp),
@@ -512,7 +512,7 @@ xfs_dir2_block_to_leaf(
 	 * Log the data header and leaf bests table.
 	 */
 	if (needlog)
-		xfs_dir2_data_log_header(tp, dbp);
+		xfs_dir2_data_log_header(tp, dp, dbp);
 	xfs_dir3_leaf_check(mp, lbp);
 	xfs_dir3_data_check(dp, dbp);
 	xfs_dir3_leaf_log_bests(tp, lbp, 0, 0);
@@ -862,7 +862,7 @@ xfs_dir2_leaf_addname(
 		else
 			xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
 		hdr = dbp->b_addr;
-		bf = xfs_dir3_data_bestfree_p(hdr);
+		bf = dp->d_ops->data_bestfree_p(hdr);
 		bestsp[use_block] = bf[0].length;
 		grown = 1;
 	} else {
@@ -878,7 +878,7 @@ xfs_dir2_leaf_addname(
 			return error;
 		}
 		hdr = dbp->b_addr;
-		bf = xfs_dir3_data_bestfree_p(hdr);
+		bf = dp->d_ops->data_bestfree_p(hdr);
 		grown = 0;
 	}
 	/*
@@ -891,7 +891,7 @@ xfs_dir2_leaf_addname(
 	/*
 	 * Mark the initial part of our freespace in use for the new entry.
 	 */
-	xfs_dir2_data_use_free(tp, dbp, dup,
+	xfs_dir2_data_use_free(tp, dp, dbp, dup,
 		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
 		&needlog, &needscan);
 	/*
@@ -913,7 +913,7 @@ xfs_dir2_leaf_addname(
 	 * Need to log the data block's header.
 	 */
 	if (needlog)
-		xfs_dir2_data_log_header(tp, dbp);
+		xfs_dir2_data_log_header(tp, dp, dbp);
 	xfs_dir2_data_log_entry(tp, dp, dbp, dep);
 	/*
 	 * If the bests table needs to be changed, do it.
@@ -1413,7 +1413,7 @@ xfs_dir2_leaf_removename(
 	leaf = lbp->b_addr;
 	hdr = dbp->b_addr;
 	xfs_dir3_data_check(dp, dbp);
-	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf = dp->d_ops->data_bestfree_p(hdr);
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = xfs_dir3_leaf_ents_p(leaf);
 	/*
@@ -1431,7 +1431,7 @@ xfs_dir2_leaf_removename(
 	/*
 	 * Mark the former data entry unused.
 	 */
-	xfs_dir2_data_make_free(tp, dbp,
+	xfs_dir2_data_make_free(tp, dp, dbp,
 		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
 		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
 	/*
@@ -1451,7 +1451,7 @@ xfs_dir2_leaf_removename(
 	if (needscan)
 		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
-		xfs_dir2_data_log_header(tp, dbp);
+		xfs_dir2_data_log_header(tp, dp, dbp);
 	/*
 	 * If the longest freespace in the data block has changed,
 	 * put the new value in the bests table and log that.
@@ -1465,7 +1465,7 @@ xfs_dir2_leaf_removename(
 	 * If the data block is now empty then get rid of the data block.
 	 */
 	if (be16_to_cpu(bf[0].length) ==
-			mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) {
+			mp->m_dirblksize - dp->d_ops->data_entry_offset()) {
 		ASSERT(db != mp->m_dirdatablk);
 		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
 			/*
@@ -1659,12 +1659,12 @@ xfs_dir2_leaf_trim_data(
 #ifdef DEBUG
 {
 	struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
-	struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr);
+	struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
 	ASSERT(be16_to_cpu(bf[0].length) ==
-	       mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr));
+	       mp->m_dirblksize - dp->d_ops->data_entry_offset());
 	ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
 }
 #endif
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index a1d133981c14..9cfc00323b19 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1255,10 +1255,10 @@ xfs_dir2_leafn_remove(
 	dbp = dblk->bp;
 	hdr = dbp->b_addr;
 	dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
-	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf = dp->d_ops->data_bestfree_p(hdr);
 	longest = be16_to_cpu(bf[0].length);
 	needlog = needscan = 0;
-	xfs_dir2_data_make_free(tp, dbp, off,
+	xfs_dir2_data_make_free(tp, dp, dbp, off,
 		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
 	/*
 	 * Rescan the data block freespaces for bestfree.
@@ -1267,7 +1267,7 @@ xfs_dir2_leafn_remove(
 	if (needscan)
 		xfs_dir2_data_freescan(dp, hdr, &needlog);
 	if (needlog)
-		xfs_dir2_data_log_header(tp, dbp);
+		xfs_dir2_data_log_header(tp, dp, dbp);
 	xfs_dir3_data_check(dp, dbp);
 	/*
 	 * If the longest data block freespace changes, need to update
@@ -1308,7 +1308,7 @@ xfs_dir2_leafn_remove(
 		 * (usually).
 		 */
 		if (longest == mp->m_dirblksize -
-			       xfs_dir3_data_entry_offset(hdr)) {
+			       dp->d_ops->data_entry_offset()) {
 			/*
 			 * Try to punch out the data block.
 			 */
@@ -1962,7 +1962,7 @@ xfs_dir2_node_addname_int(
 		 * change again.
 		 */
 		hdr = dbp->b_addr;
-		bf = xfs_dir3_data_bestfree_p(hdr);
+		bf = dp->d_ops->data_bestfree_p(hdr);
 		bests[findex] = bf[0].length;
 		logfree = 1;
 	}
@@ -1984,7 +1984,7 @@ xfs_dir2_node_addname_int(
 		if (error)
 			return error;
 		hdr = dbp->b_addr;
-		bf = xfs_dir3_data_bestfree_p(hdr);
+		bf = dp->d_ops->data_bestfree_p(hdr);
 		logfree = 0;
 	}
 	ASSERT(be16_to_cpu(bf[0].length) >= length);
@@ -1997,7 +1997,7 @@ xfs_dir2_node_addname_int(
 	/*
 	 * Mark the first part of the unused space, inuse for us.
 	 */
-	xfs_dir2_data_use_free(tp, dbp, dup,
+	xfs_dir2_data_use_free(tp, dp, dbp, dup,
 		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
 		&needlog, &needscan);
 	/*
@@ -2020,7 +2020,7 @@ xfs_dir2_node_addname_int(
 	 * Log the data block header if needed.
 	 */
 	if (needlog)
-		xfs_dir2_data_log_header(tp, dbp);
+		xfs_dir2_data_log_header(tp, dp, dbp);
 	/*
 	 * If the freespace entry is now wrong, update it.
 	 */
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 1bad84c40829..9a69f206daf9 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -59,7 +59,8 @@ extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
 
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
-		struct xfs_dir2_data_unused *dup, int *loghead);
+		struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+		int *loghead);
 extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
 		struct xfs_buf **bpp);
 
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index b99aa7d48e90..f45ecac821ec 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -213,7 +213,7 @@ xfs_dir2_block_getdents(
 	 * Set up values for the loop.
 	 */
 	btp = xfs_dir2_block_tail_p(mp, hdr);
-	ptr = (char *)xfs_dir3_data_entry_p(hdr);
+	ptr = (char *)dp->d_ops->data_entry_p(hdr);
 	endptr = (char *)xfs_dir2_block_leaf_p(btp);
 
 	/*
@@ -578,13 +578,13 @@ xfs_dir2_leaf_getdents(
 			/*
 			 * Find our position in the block.
 			 */
-			ptr = (char *)xfs_dir3_data_entry_p(hdr);
+			ptr = (char *)dp->d_ops->data_entry_p(hdr);
 			byteoff = xfs_dir2_byte_to_off(mp, curoff);
 			/*
 			 * Skip past the header.
 			 */
 			if (byteoff == 0)
-				curoff += xfs_dir3_data_entry_offset(hdr);
+				curoff += dp->d_ops->data_entry_offset();
 			/*
 			 * Skip past entries until we reach our offset.
 			 */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 953e19479151..42bbcff15cc5 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -210,7 +210,7 @@ xfs_dir2_block_to_sf(
 	 * Set up to loop over the block's entries.
 	 */
 	btp = xfs_dir2_block_tail_p(mp, hdr);
-	ptr = (char *)xfs_dir3_data_entry_p(hdr);
+	ptr = (char *)dp->d_ops->data_entry_p(hdr);
 	endptr = (char *)xfs_dir2_block_leaf_p(btp);
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	/*
-- 
cgit v1.3


From 4141956ae05e0685b14b30f92fdc8fb11b4a0cb2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:50 +1100
Subject: xfs: vectorise directory leaf operations

Next step in the vectorisation process is the leaf block
encode/decode operations. Most of the operations on leaves are
handled by the data block vectors, so there are relatively few of
them here.

Because of all the shuffling of code and having to pass more state
to some functions, this patch doesn't directly reduce the size of
the binary. It does open up many more opportunities for factoring
and optimisation, however.

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2
 789293   96802    1096  887191   d8997 fs/xfs/xfs.o.p3
 789005   96802    1096  886903   d8997 fs/xfs/xfs.o.p4
 789061   96802    1096  886959   d88af fs/xfs/xfs.o.p5

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_btree.c   |  19 ++++----
 fs/xfs/xfs_da_format.c  |  75 +++++++++++++++++++++++++++++++
 fs/xfs/xfs_da_format.h  |  30 -------------
 fs/xfs/xfs_dir2.c       |   7 +--
 fs/xfs/xfs_dir2.h       |  10 +++--
 fs/xfs/xfs_dir2_block.c |   2 +-
 fs/xfs/xfs_dir2_data.c  |  13 ++----
 fs/xfs/xfs_dir2_leaf.c  |  91 ++++++++++++++++++++-----------------
 fs/xfs/xfs_dir2_node.c  | 117 ++++++++++++++++++++++++++----------------------
 fs/xfs/xfs_dir2_priv.h  |  13 +++---
 10 files changed, 218 insertions(+), 159 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index eb65c546ffd8..df3759c51470 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -605,7 +605,7 @@ xfs_da3_root_split(
 
 		leaf = (xfs_dir2_leaf_t *)oldroot;
 		xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-		ents = xfs_dir3_leaf_ents_p(leaf);
+		ents = dp->d_ops->leaf_ents_p(leaf);
 
 		ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
 		       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -1319,7 +1319,8 @@ xfs_da3_fixhashpath(
 			return;
 		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
+		lasthash = xfs_dir2_leafn_lasthash(state->args->dp,
+						   blk->bp, &count);
 		if (count == 0)
 			return;
 		break;
@@ -1536,7 +1537,8 @@ xfs_da3_node_lookup_int(
 		if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
 		    blk->magic == XFS_DIR3_LEAFN_MAGIC) {
 			blk->magic = XFS_DIR2_LEAFN_MAGIC;
-			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
+			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+							       blk->bp, NULL);
 			break;
 		}
 
@@ -1702,7 +1704,7 @@ xfs_da3_blk_link(
 		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
 		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
+		before = xfs_dir2_leafn_order(args->dp, old_blk->bp, new_blk->bp);
 		break;
 	case XFS_DA_NODE_MAGIC:
 		before = xfs_da3_node_order(old_blk->bp, new_blk->bp);
@@ -1947,16 +1949,15 @@ xfs_da3_path_shift(
 			blk->magic = XFS_ATTR_LEAF_MAGIC;
 			ASSERT(level == path->active-1);
 			blk->index = 0;
-			blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
-							      NULL);
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
 			break;
 		case XFS_DIR2_LEAFN_MAGIC:
 		case XFS_DIR3_LEAFN_MAGIC:
 			blk->magic = XFS_DIR2_LEAFN_MAGIC;
 			ASSERT(level == path->active-1);
 			blk->index = 0;
-			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
-							       NULL);
+			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+							       blk->bp, NULL);
 			break;
 		default:
 			ASSERT(0);
@@ -2223,7 +2224,7 @@ xfs_da3_swap_lastblock(
 
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
 		xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2);
-		ents = xfs_dir3_leaf_ents_p(dead_leaf2);
+		ents = ip->d_ops->leaf_ents_p(dead_leaf2);
 		dead_level = 0;
 		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
 	} else {
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index 1e92a241fd52..ff8b50368c94 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -435,6 +435,48 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
 		((char *)hdr + xfs_dir3_data_entry_offset());
 }
 
+
+/*
+ * Directory Leaf block operations
+ */
+static int
+xfs_dir2_leaf_hdr_size(void)
+{
+	return sizeof(struct xfs_dir2_leaf_hdr);
+}
+
+static int
+xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
+{
+	return (mp->m_dirblksize - xfs_dir2_leaf_hdr_size()) /
+		(uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	return lp->__ents;
+}
+
+static int
+xfs_dir3_leaf_hdr_size(void)
+{
+	return sizeof(struct xfs_dir3_leaf_hdr);
+}
+
+static inline int
+xfs_dir3_max_leaf_ents(struct xfs_mount *mp)
+{
+	return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size()) /
+		(uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	return ((struct xfs_dir3_leaf *)lp)->__ents;
+}
+
 const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_entsize = xfs_dir2_sf_entsize,
 	.sf_nextentry = xfs_dir2_sf_nextentry,
@@ -462,6 +504,10 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.data_entry_p = xfs_dir2_data_entry_p,
 	.data_unused_p = xfs_dir2_data_unused_p,
 
+	.leaf_hdr_size = xfs_dir2_leaf_hdr_size,
+	.leaf_max_ents = xfs_dir2_max_leaf_ents,
+	.leaf_ents_p = xfs_dir2_leaf_ents_p,
+
 };
 
 const struct xfs_dir_ops xfs_dir2_ftype_ops = {
@@ -490,6 +536,10 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.data_first_entry_p = xfs_dir2_data_first_entry_p,
 	.data_entry_p = xfs_dir2_data_entry_p,
 	.data_unused_p = xfs_dir2_data_unused_p,
+
+	.leaf_hdr_size = xfs_dir2_leaf_hdr_size,
+	.leaf_max_ents = xfs_dir2_max_leaf_ents,
+	.leaf_ents_p = xfs_dir2_leaf_ents_p,
 };
 
 const struct xfs_dir_ops xfs_dir3_ops = {
@@ -518,4 +568,29 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.data_first_entry_p = xfs_dir3_data_first_entry_p,
 	.data_entry_p = xfs_dir3_data_entry_p,
 	.data_unused_p = xfs_dir3_data_unused_p,
+
+	.leaf_hdr_size = xfs_dir3_leaf_hdr_size,
+	.leaf_max_ents = xfs_dir3_max_leaf_ents,
+	.leaf_ents_p = xfs_dir3_leaf_ents_p,
 };
+
+/*
+ * Return the ops structure according to the current config.  If we are passed
+ * an inode, then that overrides the default config we use which is based on
+ * feature bits.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp)
+{
+	if (dp)
+		return dp->d_ops;
+	if (mp->m_dir_inode_ops)
+		return mp->m_dir_inode_ops;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return &xfs_dir3_ops;
+	if (xfs_sb_version_hasftype(&mp->m_sb))
+		return &xfs_dir2_ftype_ops;
+	return &xfs_dir2_ops;
+}
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index aeca0038a013..0a567e25cf59 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -551,36 +551,6 @@ struct xfs_dir3_leaf {
 extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
 					struct xfs_dir2_leaf *from);
 
-static inline int
-xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
-{
-	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
-		return sizeof(struct xfs_dir3_leaf_hdr);
-	return sizeof(struct xfs_dir2_leaf_hdr);
-}
-
-static inline int
-xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
-{
-	return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) /
-		(uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-/*
- * Get address of the bestcount field in the single-leaf block.
- */
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
-	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
-	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
-		struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp;
-		return lp3->__ents;
-	}
-	return lp->__ents;
-}
-
 /*
  * Get address of the bestcount field in the single-leaf block.
  */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 7911136453cd..2b98a33ca383 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -113,12 +113,7 @@ xfs_dir_mount(
 	else
 		mp->m_dirnameops = &xfs_default_nameops;
 
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		mp->m_dir_inode_ops = &xfs_dir3_ops;
-	else if (xfs_sb_version_hasftype(&mp->m_sb))
-		mp->m_dir_inode_ops = &xfs_dir2_ftype_ops;
-	else
-		mp->m_dir_inode_ops = &xfs_dir2_ops;
+	mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 076010a1376a..9ba9db731990 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -74,11 +74,15 @@ struct xfs_dir_ops {
 		(*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
 	struct xfs_dir2_data_unused *
 		(*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+
+	int	(*leaf_hdr_size)(void);
+	int	(*leaf_max_ents)(struct xfs_mount *mp);
+	struct xfs_dir2_leaf_entry *
+		(*leaf_ents_p)(struct xfs_dir2_leaf *lp);
 };
 
-extern const struct xfs_dir_ops xfs_dir2_ops;
-extern const struct xfs_dir_ops xfs_dir2_ftype_ops;
-extern const struct xfs_dir_ops xfs_dir3_ops;
+extern const struct xfs_dir_ops *
+	xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
 
 /*
  * Generic directory interface routines
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index f92ca5699342..75f815786b97 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -937,7 +937,7 @@ xfs_dir2_leaf_to_block(
 	mp = dp->i_mount;
 	leaf = lbp->b_addr;
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 
 	ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 13fa0d6b4ff7..32d657fbe6a4 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -68,17 +68,10 @@ __xfs_dir3_data_check(
 	hdr = bp->b_addr;
 
 	/*
-	 * We can be passed a null dp here from a verifier, so manually
-	 * configure the ops here in that case.
+	 * We can be passed a null dp here from a verifier, so we need to go the
+	 * hard way to get them.
 	 */
-	if (dp)
-		ops = dp->d_ops;
-	else if (xfs_sb_version_hascrc(&mp->m_sb))
-		ops = &xfs_dir3_ops;
-	else if (xfs_sb_version_hasftype(&mp->m_sb))
-		ops = &xfs_dir2_ftype_ops;
-	else
-		ops = &xfs_dir2_ops;
+	ops = xfs_dir_get_ops(mp, dp);
 
 	switch (hdr->magic) {
 	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index f35b3942c1a6..16fdc0e627a7 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -50,15 +50,15 @@ static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
  * Pop an assert if something is wrong.
  */
 #ifdef DEBUG
-#define	xfs_dir3_leaf_check(mp, bp) \
+#define	xfs_dir3_leaf_check(dp, bp) \
 do { \
-	if (!xfs_dir3_leaf1_check((mp), (bp))) \
+	if (!xfs_dir3_leaf1_check((dp), (bp))) \
 		ASSERT(0); \
 } while (0);
 
 STATIC bool
 xfs_dir3_leaf1_check(
-	struct xfs_mount	*mp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp)
 {
 	struct xfs_dir2_leaf	*leaf = bp->b_addr;
@@ -73,10 +73,10 @@ xfs_dir3_leaf1_check(
 	} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
 		return false;
 
-	return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
 }
 #else
-#define	xfs_dir3_leaf_check(mp, bp)
+#define	xfs_dir3_leaf_check(dp, bp)
 #endif
 
 void
@@ -138,6 +138,7 @@ xfs_dir3_leaf_hdr_to_disk(
 bool
 xfs_dir3_leaf_check_int(
 	struct xfs_mount	*mp,
+	struct xfs_inode	*dp,
 	struct xfs_dir3_icleaf_hdr *hdr,
 	struct xfs_dir2_leaf	*leaf)
 {
@@ -145,8 +146,15 @@ xfs_dir3_leaf_check_int(
 	xfs_dir2_leaf_tail_t	*ltp;
 	int			stale;
 	int			i;
+	const struct xfs_dir_ops *ops;
 
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	/*
+	 * we can be passed a null dp here from a verifier, so we need to go the
+	 * hard way to get them.
+	 */
+	ops = xfs_dir_get_ops(mp, dp);
+
+	ents = ops->leaf_ents_p(leaf);
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 
 	/*
@@ -154,7 +162,7 @@ xfs_dir3_leaf_check_int(
 	 * Should factor in the size of the bests table as well.
 	 * We can deduce a value for that from di_size.
 	 */
-	if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf))
+	if (hdr->count > ops->leaf_max_ents(mp))
 		return false;
 
 	/* Leaves and bests don't overlap in leaf format. */
@@ -213,7 +221,7 @@ xfs_dir3_leaf_verify(
 	}
 
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+	return xfs_dir3_leaf_check_int(mp, NULL, &leafhdr, leaf);
 }
 
 static void
@@ -399,7 +407,7 @@ xfs_dir3_leaf_get_buf(
 		return error;
 
 	xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
-	xfs_dir3_leaf_log_header(tp, bp);
+	xfs_dir3_leaf_log_header(tp, dp, bp);
 	if (magic == XFS_DIR2_LEAF1_MAGIC)
 		xfs_dir3_leaf_log_tail(tp, bp);
 	*bpp = bp;
@@ -461,7 +469,7 @@ xfs_dir2_block_to_leaf(
 	btp = xfs_dir2_block_tail_p(mp, hdr);
 	blp = xfs_dir2_block_leaf_p(btp);
 	bf = dp->d_ops->data_bestfree_p(hdr);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	/*
 	 * Set the counts in the leaf header.
@@ -470,14 +478,14 @@ xfs_dir2_block_to_leaf(
 	leafhdr.count = be32_to_cpu(btp->count);
 	leafhdr.stale = be32_to_cpu(btp->stale);
 	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(tp, lbp);
+	xfs_dir3_leaf_log_header(tp, dp, lbp);
 
 	/*
 	 * Could compact these but I think we always do the conversion
 	 * after squeezing out stale entries.
 	 */
 	memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
-	xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1);
+	xfs_dir3_leaf_log_ents(tp, dp, lbp, 0, leafhdr.count - 1);
 	needscan = 0;
 	needlog = 1;
 	/*
@@ -513,7 +521,7 @@ xfs_dir2_block_to_leaf(
 	 */
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dp, dbp);
-	xfs_dir3_leaf_check(mp, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
 	xfs_dir3_data_check(dp, dbp);
 	xfs_dir3_leaf_log_bests(tp, lbp, 0, 0);
 	return 0;
@@ -697,7 +705,7 @@ xfs_dir2_leaf_addname(
 	index = xfs_dir2_leaf_search_hash(args, lbp);
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
 	length = dp->d_ops->data_entsize(args->namelen);
@@ -938,9 +946,9 @@ xfs_dir2_leaf_addname(
 	 * Log the leaf fields and give up the buffers.
 	 */
 	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(tp, lbp);
-	xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
-	xfs_dir3_leaf_check(mp, lbp);
+	xfs_dir3_leaf_log_header(tp, dp, lbp);
+	xfs_dir3_leaf_log_ents(tp, dp, lbp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(dp, lbp);
 	xfs_dir3_data_check(dp, dbp);
 	return 0;
 }
@@ -968,7 +976,7 @@ xfs_dir3_leaf_compact(
 	/*
 	 * Compress out the stale entries in place.
 	 */
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = args->dp->d_ops->leaf_ents_p(leaf);
 	for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
 		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 			continue;
@@ -990,9 +998,10 @@ xfs_dir3_leaf_compact(
 	leafhdr->stale = 0;
 
 	xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr);
-	xfs_dir3_leaf_log_header(args->trans, bp);
+	xfs_dir3_leaf_log_header(args->trans, args->dp, bp);
 	if (loglow != -1)
-		xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1);
+		xfs_dir3_leaf_log_ents(args->trans, args->dp, bp,
+				       loglow, to - 1);
 }
 
 /*
@@ -1119,10 +1128,11 @@ xfs_dir3_leaf_log_bests(
  */
 void
 xfs_dir3_leaf_log_ents(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	struct xfs_buf		*bp,		/* leaf buffer */
-	int			first,		/* first entry to log */
-	int			last)		/* last entry to log */
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp,
+	int			first,
+	int			last)
 {
 	xfs_dir2_leaf_entry_t	*firstlep;	/* pointer to first entry */
 	xfs_dir2_leaf_entry_t	*lastlep;	/* pointer to last entry */
@@ -1134,7 +1144,7 @@ xfs_dir3_leaf_log_ents(
 	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
 	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
 
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	firstlep = &ents[first];
 	lastlep = &ents[last];
 	xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
@@ -1147,6 +1157,7 @@ xfs_dir3_leaf_log_ents(
 void
 xfs_dir3_leaf_log_header(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp)
 {
 	struct xfs_dir2_leaf	*leaf = bp->b_addr;
@@ -1157,7 +1168,7 @@ xfs_dir3_leaf_log_header(
 	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
 
 	xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
-			  xfs_dir3_leaf_hdr_size(leaf) - 1);
+			  dp->d_ops->leaf_hdr_size() - 1);
 }
 
 /*
@@ -1212,9 +1223,9 @@ xfs_dir2_leaf_lookup(
 	}
 	tp = args->trans;
 	dp = args->dp;
-	xfs_dir3_leaf_check(dp->i_mount, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
 	leaf = lbp->b_addr;
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	/*
 	 * Get to the leaf entry and contained data entry address.
 	 */
@@ -1277,8 +1288,8 @@ xfs_dir2_leaf_lookup_int(
 
 	*lbpp = lbp;
 	leaf = lbp->b_addr;
-	xfs_dir3_leaf_check(mp, lbp);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	xfs_dir3_leaf_check(dp, lbp);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 
 	/*
@@ -1415,7 +1426,7 @@ xfs_dir2_leaf_removename(
 	xfs_dir3_data_check(dp, dbp);
 	bf = dp->d_ops->data_bestfree_p(hdr);
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	/*
 	 * Point to the leaf entry, use that to point to the data entry.
 	 */
@@ -1439,10 +1450,10 @@ xfs_dir2_leaf_removename(
 	 */
 	leafhdr.stale++;
 	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(tp, lbp);
+	xfs_dir3_leaf_log_header(tp, dp, lbp);
 
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-	xfs_dir3_leaf_log_ents(tp, lbp, index, index);
+	xfs_dir3_leaf_log_ents(tp, dp, lbp, index, index);
 
 	/*
 	 * Scan the freespace in the data block again if necessary,
@@ -1476,7 +1487,7 @@ xfs_dir2_leaf_removename(
 			 */
 			if (error == ENOSPC && args->total == 0)
 				error = 0;
-			xfs_dir3_leaf_check(mp, lbp);
+			xfs_dir3_leaf_check(dp, lbp);
 			return error;
 		}
 		dbp = NULL;
@@ -1510,7 +1521,7 @@ xfs_dir2_leaf_removename(
 	else if (db != mp->m_dirdatablk)
 		dbp = NULL;
 
-	xfs_dir3_leaf_check(mp, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
 	/*
 	 * See if we can convert to block form.
 	 */
@@ -1545,7 +1556,7 @@ xfs_dir2_leaf_replace(
 	}
 	dp = args->dp;
 	leaf = lbp->b_addr;
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	/*
 	 * Point to the leaf entry, get data address from it.
 	 */
@@ -1564,7 +1575,7 @@ xfs_dir2_leaf_replace(
 	dp->d_ops->data_put_ftype(dep, args->filetype);
 	tp = args->trans;
 	xfs_dir2_data_log_entry(tp, dp, dbp, dep);
-	xfs_dir3_leaf_check(dp->i_mount, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
 	xfs_trans_brelse(tp, lbp);
 	return 0;
 }
@@ -1590,7 +1601,7 @@ xfs_dir2_leaf_search_hash(
 	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	leaf = lbp->b_addr;
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = args->dp->d_ops->leaf_ents_p(leaf);
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 
 	/*
@@ -1830,10 +1841,10 @@ xfs_dir2_node_to_leaf(
 		freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
 
 	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(tp, lbp);
+	xfs_dir3_leaf_log_header(tp, dp, lbp);
 	xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
 	xfs_dir3_leaf_log_tail(tp, lbp);
-	xfs_dir3_leaf_check(mp, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
 
 	/*
 	 * Get rid of the freespace block.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 9cfc00323b19..3a426ed9b16e 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -54,15 +54,15 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
  * Check internal consistency of a leafn block.
  */
 #ifdef DEBUG
-#define	xfs_dir3_leaf_check(mp, bp) \
+#define	xfs_dir3_leaf_check(dp, bp) \
 do { \
-	if (!xfs_dir3_leafn_check((mp), (bp))) \
+	if (!xfs_dir3_leafn_check((dp), (bp))) \
 		ASSERT(0); \
 } while (0);
 
 static bool
 xfs_dir3_leafn_check(
-	struct xfs_mount	*mp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp)
 {
 	struct xfs_dir2_leaf	*leaf = bp->b_addr;
@@ -77,10 +77,10 @@ xfs_dir3_leafn_check(
 	} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
 		return false;
 
-	return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
 }
 #else
-#define	xfs_dir3_leaf_check(mp, bp)
+#define	xfs_dir3_leaf_check(dp, bp)
 #endif
 
 static bool
@@ -408,8 +408,8 @@ xfs_dir2_leaf_to_node(
 		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
 	lbp->b_ops = &xfs_dir3_leafn_buf_ops;
 	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
-	xfs_dir3_leaf_log_header(tp, lbp);
-	xfs_dir3_leaf_check(mp, lbp);
+	xfs_dir3_leaf_log_header(tp, dp, lbp);
+	xfs_dir3_leaf_check(dp, lbp);
 	return 0;
 }
 
@@ -443,7 +443,7 @@ xfs_dir2_leafn_add(
 	tp = args->trans;
 	leaf = bp->b_addr;
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	/*
 	 * Quick check just to make sure we are not going to index
@@ -459,7 +459,7 @@ xfs_dir2_leafn_add(
 	 * a compact.
 	 */
 
-	if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) {
+	if (leafhdr.count == dp->d_ops->leaf_max_ents(mp)) {
 		if (!leafhdr.stale)
 			return XFS_ERROR(ENOSPC);
 		compact = leafhdr.stale > 1;
@@ -498,9 +498,9 @@ xfs_dir2_leafn_add(
 				args->blkno, args->index));
 
 	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(tp, bp);
-	xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
-	xfs_dir3_leaf_check(mp, bp);
+	xfs_dir3_leaf_log_header(tp, dp, bp);
+	xfs_dir3_leaf_log_ents(tp, dp, bp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(dp, bp);
 	return 0;
 }
 
@@ -529,6 +529,7 @@ xfs_dir2_free_hdr_check(
  */
 xfs_dahash_t					/* hash value */
 xfs_dir2_leafn_lasthash(
+	struct xfs_inode *dp,
 	struct xfs_buf	*bp,			/* leaf buffer */
 	int		*count)			/* count of entries in leaf */
 {
@@ -546,7 +547,7 @@ xfs_dir2_leafn_lasthash(
 	if (!leafhdr.count)
 		return 0;
 
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	return be32_to_cpu(ents[leafhdr.count - 1].hashval);
 }
 
@@ -584,9 +585,9 @@ xfs_dir2_leafn_lookup_for_addname(
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 
-	xfs_dir3_leaf_check(mp, bp);
+	xfs_dir3_leaf_check(dp, bp);
 	ASSERT(leafhdr.count > 0);
 
 	/*
@@ -734,9 +735,9 @@ xfs_dir2_leafn_lookup_for_entry(
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 
-	xfs_dir3_leaf_check(mp, bp);
+	xfs_dir3_leaf_check(dp, bp);
 	ASSERT(leafhdr.count > 0);
 
 	/*
@@ -906,7 +907,7 @@ xfs_dir3_leafn_moveents(
 	if (start_d < dhdr->count) {
 		memmove(&dents[start_d + count], &dents[start_d],
 			(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
-		xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count,
+		xfs_dir3_leaf_log_ents(tp, args->dp, bp_d, start_d + count,
 				       count + dhdr->count - 1);
 	}
 	/*
@@ -928,7 +929,8 @@ xfs_dir3_leafn_moveents(
 	 */
 	memcpy(&dents[start_d], &sents[start_s],
 		count * sizeof(xfs_dir2_leaf_entry_t));
-	xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+	xfs_dir3_leaf_log_ents(tp, args->dp, bp_d,
+			       start_d, start_d + count - 1);
 
 	/*
 	 * If there are source entries after the ones we copied,
@@ -937,7 +939,8 @@ xfs_dir3_leafn_moveents(
 	if (start_s + count < shdr->count) {
 		memmove(&sents[start_s], &sents[start_s + count],
 			count * sizeof(xfs_dir2_leaf_entry_t));
-		xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+		xfs_dir3_leaf_log_ents(tp, args->dp, bp_s,
+				       start_s, start_s + count - 1);
 	}
 
 	/*
@@ -955,6 +958,7 @@ xfs_dir3_leafn_moveents(
  */
 int						/* sort order */
 xfs_dir2_leafn_order(
+	struct xfs_inode	*dp,
 	struct xfs_buf		*leaf1_bp,		/* leaf1 buffer */
 	struct xfs_buf		*leaf2_bp)		/* leaf2 buffer */
 {
@@ -967,8 +971,8 @@ xfs_dir2_leafn_order(
 
 	xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
 	xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
-	ents1 = xfs_dir3_leaf_ents_p(leaf1);
-	ents2 = xfs_dir3_leaf_ents_p(leaf2);
+	ents1 = dp->d_ops->leaf_ents_p(leaf1);
+	ents2 = dp->d_ops->leaf_ents_p(leaf2);
 
 	if (hdr1.count > 0 && hdr2.count > 0 &&
 	    (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
@@ -1006,12 +1010,13 @@ xfs_dir2_leafn_rebalance(
 	struct xfs_dir2_leaf_entry *ents2;
 	struct xfs_dir3_icleaf_hdr hdr1;
 	struct xfs_dir3_icleaf_hdr hdr2;
+	struct xfs_inode	*dp = state->args->dp;
 
 	args = state->args;
 	/*
 	 * If the block order is wrong, swap the arguments.
 	 */
-	if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
+	if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
 		xfs_da_state_blk_t	*tmp;	/* temp for block swap */
 
 		tmp = blk1;
@@ -1022,8 +1027,8 @@ xfs_dir2_leafn_rebalance(
 	leaf2 = blk2->bp->b_addr;
 	xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
 	xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
-	ents1 = xfs_dir3_leaf_ents_p(leaf1);
-	ents2 = xfs_dir3_leaf_ents_p(leaf2);
+	ents1 = dp->d_ops->leaf_ents_p(leaf1);
+	ents2 = dp->d_ops->leaf_ents_p(leaf2);
 
 	oldsum = hdr1.count + hdr2.count;
 #if defined(DEBUG) || defined(XFS_WARN)
@@ -1071,11 +1076,11 @@ xfs_dir2_leafn_rebalance(
 	/* log the changes made when moving the entries */
 	xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1);
 	xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2);
-	xfs_dir3_leaf_log_header(args->trans, blk1->bp);
-	xfs_dir3_leaf_log_header(args->trans, blk2->bp);
+	xfs_dir3_leaf_log_header(args->trans, dp, blk1->bp);
+	xfs_dir3_leaf_log_header(args->trans, dp, blk2->bp);
 
-	xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp);
-	xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp);
+	xfs_dir3_leaf_check(dp, blk1->bp);
+	xfs_dir3_leaf_check(dp, blk2->bp);
 
 	/*
 	 * Mark whether we're inserting into the old or new leaf.
@@ -1096,10 +1101,10 @@ xfs_dir2_leafn_rebalance(
 	 * Finally sanity check just to make sure we are not returning a
 	 * negative index
 	 */
-	if(blk2->index < 0) {
+	if (blk2->index < 0) {
 		state->inleaf = 1;
 		blk2->index = 0;
-		xfs_alert(args->dp->i_mount,
+		xfs_alert(dp->i_mount,
 	"%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
 			__func__, blk1->index);
 	}
@@ -1222,7 +1227,7 @@ xfs_dir2_leafn_remove(
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	/*
 	 * Point to the entry we're removing.
@@ -1243,10 +1248,10 @@ xfs_dir2_leafn_remove(
 	 */
 	leafhdr.stale++;
 	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
-	xfs_dir3_leaf_log_header(tp, bp);
+	xfs_dir3_leaf_log_header(tp, dp, bp);
 
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-	xfs_dir3_leaf_log_ents(tp, bp, index, index);
+	xfs_dir3_leaf_log_ents(tp, dp, bp, index, index);
 
 	/*
 	 * Make the data entry free.  Keep track of the longest freespace
@@ -1335,12 +1340,12 @@ xfs_dir2_leafn_remove(
 			return error;
 	}
 
-	xfs_dir3_leaf_check(mp, bp);
+	xfs_dir3_leaf_check(dp, bp);
 	/*
 	 * Return indication of whether this leaf block is empty enough
 	 * to justify trying to join it with a neighbor.
 	 */
-	*rval = (xfs_dir3_leaf_hdr_size(leaf) +
+	*rval = (dp->d_ops->leaf_hdr_size() +
 		 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
 		mp->m_dir_magicpct;
 	return 0;
@@ -1359,12 +1364,14 @@ xfs_dir2_leafn_split(
 	xfs_dablk_t		blkno;		/* new leaf block number */
 	int			error;		/* error return value */
 	xfs_mount_t		*mp;		/* filesystem mount point */
+	struct xfs_inode	*dp;
 
 	/*
 	 * Allocate space for a new leaf node.
 	 */
 	args = state->args;
-	mp = args->dp->i_mount;
+	dp = args->dp;
+	mp = dp->i_mount;
 	ASSERT(args != NULL);
 	ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
 	error = xfs_da_grow_inode(args, &blkno);
@@ -1400,10 +1407,10 @@ xfs_dir2_leafn_split(
 	/*
 	 * Update last hashval in each block since we added the name.
 	 */
-	oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
-	newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
-	xfs_dir3_leaf_check(mp, oldblk->bp);
-	xfs_dir3_leaf_check(mp, newblk->bp);
+	oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+	newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+	xfs_dir3_leaf_check(dp, oldblk->bp);
+	xfs_dir3_leaf_check(dp, newblk->bp);
 	return error;
 }
 
@@ -1433,6 +1440,7 @@ xfs_dir2_leafn_toosmall(
 	int			rval;		/* result from path_shift */
 	struct xfs_dir3_icleaf_hdr leafhdr;
 	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_inode	*dp = state->args->dp;
 
 	/*
 	 * Check for the degenerate case of the block being over 50% full.
@@ -1442,11 +1450,11 @@ xfs_dir2_leafn_toosmall(
 	blk = &state->path.blk[state->path.active - 1];
 	leaf = blk->bp->b_addr;
 	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	ents = xfs_dir3_leaf_ents_p(leaf);
-	xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	xfs_dir3_leaf_check(dp, blk->bp);
 
 	count = leafhdr.count - leafhdr.stale;
-	bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]);
+	bytes = dp->d_ops->leaf_hdr_size() + count * sizeof(ents[0]);
 	if (bytes > (state->blocksize >> 1)) {
 		/*
 		 * Blk over 50%, don't try to join.
@@ -1491,7 +1499,7 @@ xfs_dir2_leafn_toosmall(
 		/*
 		 * Read the sibling leaf block.
 		 */
-		error = xfs_dir3_leafn_read(state->args->trans, state->args->dp,
+		error = xfs_dir3_leafn_read(state->args->trans, dp,
 					    blkno, -1, &bp);
 		if (error)
 			return error;
@@ -1504,7 +1512,7 @@ xfs_dir2_leafn_toosmall(
 
 		leaf = bp->b_addr;
 		xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf);
-		ents = xfs_dir3_leaf_ents_p(leaf);
+		ents = dp->d_ops->leaf_ents_p(leaf);
 		count += hdr2.count - hdr2.stale;
 		bytes -= count * sizeof(ents[0]);
 
@@ -1558,6 +1566,7 @@ xfs_dir2_leafn_unbalance(
 	struct xfs_dir3_icleaf_hdr drophdr;
 	struct xfs_dir2_leaf_entry *sents;
 	struct xfs_dir2_leaf_entry *dents;
+	struct xfs_inode	*dp = state->args->dp;
 
 	args = state->args;
 	ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
@@ -1567,8 +1576,8 @@ xfs_dir2_leafn_unbalance(
 
 	xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf);
 	xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf);
-	sents = xfs_dir3_leaf_ents_p(save_leaf);
-	dents = xfs_dir3_leaf_ents_p(drop_leaf);
+	sents = args->dp->d_ops->leaf_ents_p(save_leaf);
+	dents = args->dp->d_ops->leaf_ents_p(drop_leaf);
 
 	/*
 	 * If there are any stale leaf entries, take this opportunity
@@ -1583,7 +1592,7 @@ xfs_dir2_leafn_unbalance(
 	 * Move the entries from drop to the appropriate end of save.
 	 */
 	drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
-	if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
+	if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
 		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
 					save_blk->bp, &savehdr, sents, 0,
 					drophdr.count);
@@ -1596,11 +1605,11 @@ xfs_dir2_leafn_unbalance(
 	/* log the changes made when moving the entries */
 	xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr);
 	xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr);
-	xfs_dir3_leaf_log_header(args->trans, save_blk->bp);
-	xfs_dir3_leaf_log_header(args->trans, drop_blk->bp);
+	xfs_dir3_leaf_log_header(args->trans, dp, save_blk->bp);
+	xfs_dir3_leaf_log_header(args->trans, dp, drop_blk->bp);
 
-	xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp);
-	xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp);
+	xfs_dir3_leaf_check(dp, save_blk->bp);
+	xfs_dir3_leaf_check(dp, drop_blk->bp);
 }
 
 /*
@@ -2211,7 +2220,7 @@ xfs_dir2_node_replace(
 		blk = &state->path.blk[state->path.active - 1];
 		ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
 		leaf = blk->bp->b_addr;
-		ents = xfs_dir3_leaf_ents_p(leaf);
+		ents = args->dp->d_ops->leaf_ents_p(leaf);
 		lep = &ents[blk->index];
 		ASSERT(state->extravalid);
 		/*
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 9a69f206daf9..3e4b5ba7b41f 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -77,9 +77,9 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
 		int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
 extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
 		struct xfs_buf **bpp, __uint16_t magic);
-extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
-		int first, int last);
-extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp,
+extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_inode *dp,
+		struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp, struct xfs_inode *dp,
 		struct xfs_buf *bp);
 extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
@@ -98,17 +98,18 @@ extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
 		struct xfs_dir2_leaf *from);
 extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to,
 		struct xfs_dir3_icleaf_hdr *from);
-extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp,
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
 		struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
 
 /* xfs_dir2_node.c */
 extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
 		struct xfs_buf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+		struct xfs_buf *bp, int *count);
 extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
 		struct xfs_da_args *args, int *indexp,
 		struct xfs_da_state *state);
-extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp,
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
 		struct xfs_buf *leaf2_bp);
 extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
 	struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
-- 
cgit v1.3


From 4bceb18f1551c8c047eeb54d48cda9f5453dc12f Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 29 Oct 2013 22:11:51 +1100
Subject: xfs: vectorise DA btree operations

The remaining non-vectorised code for the directory structure is the
node format blocks. This is shared with the attribute tree, and so
is slightly more complex to vectorise.

Introduce a "non-directory" directory ops structure that is attached
to all non-directory inodes so that attribute operations can be
vectorised for all inodes.

Once we do this, we can vectorise all the da btree operations.
Because this patch adds more infrastructure than it removes the
binary size does not decrease:

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2
 789293   96802    1096  887191   d8997 fs/xfs/xfs.o.p3
 789005   96802    1096  886903   d8997 fs/xfs/xfs.o.p4
 789061   96802    1096  886959   d88af fs/xfs/xfs.o.p5
 789733   96802    1096  887631   d8b4f fs/xfs/xfs.o.p6

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr_inactive.c |   3 +-
 fs/xfs/xfs_attr_leaf.c     |   3 +-
 fs/xfs/xfs_attr_list.c     |  11 ++---
 fs/xfs/xfs_da_btree.c      | 104 +++++++++++++++++++++++++--------------------
 fs/xfs/xfs_da_format.c     |  59 +++++++++++++++++++++++++
 fs/xfs/xfs_da_format.h     |  25 -----------
 fs/xfs/xfs_dir2.c          |   7 ++-
 fs/xfs/xfs_dir2.h          |   6 +++
 fs/xfs/xfs_iops.c          |   1 +
 fs/xfs/xfs_mount.h         |   1 +
 10 files changed, 139 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index f33fb62b7f17..4855085f8c6b 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -40,6 +40,7 @@
 #include "xfs_quota.h"
 #include "xfs_trace.h"
 #include "xfs_dinode.h"
+#include "xfs_dir2.h"
 
 /*
  * Look at all the extents for this logical region,
@@ -236,7 +237,7 @@ xfs_attr3_node_inactive(
 		xfs_trans_brelse(*trans, bp);
 		return 0;
 	}
-	btree = xfs_da3_node_tree_p(node);
+	btree = dp->d_ops->node_tree_p(node);
 	child_fsb = be32_to_cpu(btree[0].before);
 	xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
 
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a0f90193a247..82f1354c77b6 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -41,6 +41,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 #include "xfs_dinode.h"
+#include "xfs_dir2.h"
 
 
 /*
@@ -916,7 +917,7 @@ xfs_attr3_leaf_to_node(
 		goto out;
 	node = bp1->b_addr;
 	xfs_da3_node_hdr_from_disk(&icnodehdr, node);
-	btree = xfs_da3_node_tree_p(node);
+	btree = dp->d_ops->node_tree_p(node);
 
 	leaf = bp2->b_addr;
 	xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 46c4ce148a43..ea1c4c46c24f 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -40,6 +40,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 #include "xfs_dinode.h"
+#include "xfs_dir2.h"
 
 STATIC int
 xfs_attr_shortform_compare(const void *a, const void *b)
@@ -226,6 +227,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	struct xfs_da_node_entry *btree;
 	int error, i;
 	struct xfs_buf *bp;
+	struct xfs_inode	*dp = context->dp;
 
 	trace_xfs_attr_node_list(context);
 
@@ -239,7 +241,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	bp = NULL;
 	if (cursor->blkno > 0) {
-		error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
+		error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
 					      &bp, XFS_ATTR_FORK);
 		if ((error != 0) && (error != EFSCORRUPTED))
 			return(error);
@@ -289,7 +291,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 		for (;;) {
 			__uint16_t magic;
 
-			error = xfs_da3_node_read(NULL, context->dp,
+			error = xfs_da3_node_read(NULL, dp,
 						      cursor->blkno, -1, &bp,
 						      XFS_ATTR_FORK);
 			if (error)
@@ -310,7 +312,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			}
 
 			xfs_da3_node_hdr_from_disk(&nodehdr, node);
-			btree = xfs_da3_node_tree_p(node);
+			btree = dp->d_ops->node_tree_p(node);
 			for (i = 0; i < nodehdr.count; btree++, i++) {
 				if (cursor->hashval
 						<= be32_to_cpu(btree->hashval)) {
@@ -346,8 +348,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			break;
 		cursor->blkno = leafhdr.forw;
 		xfs_trans_brelse(NULL, bp);
-		error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
-					   &bp);
+		error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp);
 		if (error)
 			return error;
 	}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index df3759c51470..26dfc42a28f9 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -379,7 +379,8 @@ xfs_da3_node_create(
 
 	xfs_da3_node_hdr_to_disk(node, &ichdr);
 	xfs_trans_log_buf(tp, bp,
-		XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
+		XFS_DA_LOGRANGE(node, &node->hdr,
+				args->dp->d_ops->node_hdr_size()));
 
 	*bpp = bp;
 	return(0);
@@ -590,7 +591,7 @@ xfs_da3_root_split(
 		struct xfs_da3_icnode_hdr nodehdr;
 
 		xfs_da3_node_hdr_from_disk(&nodehdr, oldroot);
-		btree = xfs_da3_node_tree_p(oldroot);
+		btree = dp->d_ops->node_tree_p(oldroot);
 		size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
 		level = nodehdr.level;
 
@@ -650,7 +651,7 @@ xfs_da3_root_split(
 
 	node = bp->b_addr;
 	xfs_da3_node_hdr_from_disk(&nodehdr, node);
-	btree = xfs_da3_node_tree_p(node);
+	btree = dp->d_ops->node_tree_p(node);
 	btree[0].hashval = cpu_to_be32(blk1->hashval);
 	btree[0].before = cpu_to_be32(blk1->blkno);
 	btree[1].hashval = cpu_to_be32(blk2->hashval);
@@ -793,6 +794,7 @@ xfs_da3_node_rebalance(
 	int			count;
 	int			tmp;
 	int			swap = 0;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_node_rebalance(state->args);
 
@@ -800,8 +802,8 @@ xfs_da3_node_rebalance(
 	node2 = blk2->bp->b_addr;
 	xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
 	xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
-	btree1 = xfs_da3_node_tree_p(node1);
-	btree2 = xfs_da3_node_tree_p(node2);
+	btree1 = dp->d_ops->node_tree_p(node1);
+	btree2 = dp->d_ops->node_tree_p(node2);
 
 	/*
 	 * Figure out how many entries need to move, and in which direction.
@@ -816,8 +818,8 @@ xfs_da3_node_rebalance(
 		node2 = tmpnode;
 		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
 		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
-		btree1 = xfs_da3_node_tree_p(node1);
-		btree2 = xfs_da3_node_tree_p(node2);
+		btree1 = dp->d_ops->node_tree_p(node1);
+		btree2 = dp->d_ops->node_tree_p(node2);
 		swap = 1;
 	}
 
@@ -882,12 +884,12 @@ xfs_da3_node_rebalance(
 	xfs_da3_node_hdr_to_disk(node1, &nodehdr1);
 	xfs_trans_log_buf(tp, blk1->bp,
 		XFS_DA_LOGRANGE(node1, &node1->hdr,
-				xfs_da3_node_hdr_size(node1)));
+				dp->d_ops->node_hdr_size()));
 
 	xfs_da3_node_hdr_to_disk(node2, &nodehdr2);
 	xfs_trans_log_buf(tp, blk2->bp,
 		XFS_DA_LOGRANGE(node2, &node2->hdr,
-				xfs_da3_node_hdr_size(node2) +
+				dp->d_ops->node_hdr_size() +
 				(sizeof(btree2[0]) * nodehdr2.count)));
 
 	/*
@@ -899,8 +901,8 @@ xfs_da3_node_rebalance(
 		node2 = blk2->bp->b_addr;
 		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
 		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
-		btree1 = xfs_da3_node_tree_p(node1);
-		btree2 = xfs_da3_node_tree_p(node2);
+		btree1 = dp->d_ops->node_tree_p(node1);
+		btree2 = dp->d_ops->node_tree_p(node2);
 	}
 	blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
 	blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
@@ -927,12 +929,13 @@ xfs_da3_node_add(
 	struct xfs_da3_icnode_hdr nodehdr;
 	struct xfs_da_node_entry *btree;
 	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_node_add(state->args);
 
 	node = oldblk->bp->b_addr;
 	xfs_da3_node_hdr_from_disk(&nodehdr, node);
-	btree = xfs_da3_node_tree_p(node);
+	btree = dp->d_ops->node_tree_p(node);
 
 	ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
 	ASSERT(newblk->blkno != 0);
@@ -957,7 +960,7 @@ xfs_da3_node_add(
 	nodehdr.count += 1;
 	xfs_da3_node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, oldblk->bp,
-		XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
+		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size()));
 
 	/*
 	 * Copy the last hash value from the oldblk to propagate upwards.
@@ -1115,7 +1118,7 @@ xfs_da3_root_join(
 	 * Read in the (only) child block, then copy those bytes into
 	 * the root block's buffer and free the original child block.
 	 */
-	btree = xfs_da3_node_tree_p(oldroot);
+	btree = args->dp->d_ops->node_tree_p(oldroot);
 	child = be32_to_cpu(btree[0].before);
 	ASSERT(child != 0);
 	error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp,
@@ -1275,6 +1278,7 @@ xfs_da3_node_toosmall(
  */
 STATIC uint
 xfs_da3_node_lasthash(
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp,
 	int			*count)
 {
@@ -1288,7 +1292,7 @@ xfs_da3_node_lasthash(
 		*count = nodehdr.count;
 	if (!nodehdr.count)
 		return 0;
-	btree = xfs_da3_node_tree_p(node);
+	btree = dp->d_ops->node_tree_p(node);
 	return be32_to_cpu(btree[nodehdr.count - 1].hashval);
 }
 
@@ -1307,6 +1311,7 @@ xfs_da3_fixhashpath(
 	xfs_dahash_t		lasthash=0;
 	int			level;
 	int			count;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_fixhashpath(state->args);
 
@@ -1319,13 +1324,12 @@ xfs_da3_fixhashpath(
 			return;
 		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		lasthash = xfs_dir2_leafn_lasthash(state->args->dp,
-						   blk->bp, &count);
+		lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
 		if (count == 0)
 			return;
 		break;
 	case XFS_DA_NODE_MAGIC:
-		lasthash = xfs_da3_node_lasthash(blk->bp, &count);
+		lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
 		if (count == 0)
 			return;
 		break;
@@ -1335,7 +1339,7 @@ xfs_da3_fixhashpath(
 
 		node = blk->bp->b_addr;
 		xfs_da3_node_hdr_from_disk(&nodehdr, node);
-		btree = xfs_da3_node_tree_p(node);
+		btree = dp->d_ops->node_tree_p(node);
 		if (be32_to_cpu(btree->hashval) == lasthash)
 			break;
 		blk->hashval = lasthash;
@@ -1361,6 +1365,7 @@ xfs_da3_node_remove(
 	struct xfs_da_node_entry *btree;
 	int			index;
 	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_node_remove(state->args);
 
@@ -1373,7 +1378,7 @@ xfs_da3_node_remove(
 	 * Copy over the offending entry, or just zero it out.
 	 */
 	index = drop_blk->index;
-	btree = xfs_da3_node_tree_p(node);
+	btree = dp->d_ops->node_tree_p(node);
 	if (index < nodehdr.count - 1) {
 		tmp  = nodehdr.count - index - 1;
 		tmp *= (uint)sizeof(xfs_da_node_entry_t);
@@ -1388,7 +1393,7 @@ xfs_da3_node_remove(
 	nodehdr.count -= 1;
 	xfs_da3_node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-	    XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
+	    XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size()));
 
 	/*
 	 * Copy the last hash value from the block to propagate upwards.
@@ -1415,6 +1420,7 @@ xfs_da3_node_unbalance(
 	struct xfs_trans	*tp;
 	int			sindex;
 	int			tmp;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_node_unbalance(state->args);
 
@@ -1422,8 +1428,8 @@ xfs_da3_node_unbalance(
 	save_node = save_blk->bp->b_addr;
 	xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node);
 	xfs_da3_node_hdr_from_disk(&save_hdr, save_node);
-	drop_btree = xfs_da3_node_tree_p(drop_node);
-	save_btree = xfs_da3_node_tree_p(save_node);
+	drop_btree = dp->d_ops->node_tree_p(drop_node);
+	save_btree = dp->d_ops->node_tree_p(save_node);
 	tp = state->args->trans;
 
 	/*
@@ -1460,7 +1466,7 @@ xfs_da3_node_unbalance(
 	xfs_da3_node_hdr_to_disk(save_node, &save_hdr);
 	xfs_trans_log_buf(tp, save_blk->bp,
 		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
-				xfs_da3_node_hdr_size(save_node)));
+				dp->d_ops->node_hdr_size()));
 
 	/*
 	 * Save the last hashval in the remaining block for upward propagation.
@@ -1502,6 +1508,7 @@ xfs_da3_node_lookup_int(
 	int			max;
 	int			error;
 	int			retval;
+	struct xfs_inode	*dp = state->args->dp;
 
 	args = state->args;
 
@@ -1550,7 +1557,7 @@ xfs_da3_node_lookup_int(
 		 */
 		node = blk->bp->b_addr;
 		xfs_da3_node_hdr_from_disk(&nodehdr, node);
-		btree = xfs_da3_node_tree_p(node);
+		btree = dp->d_ops->node_tree_p(node);
 
 		max = nodehdr.count;
 		blk->hashval = be32_to_cpu(btree[max - 1].hashval);
@@ -1645,6 +1652,7 @@ xfs_da3_node_lookup_int(
  */
 STATIC int
 xfs_da3_node_order(
+	struct xfs_inode *dp,
 	struct xfs_buf	*node1_bp,
 	struct xfs_buf	*node2_bp)
 {
@@ -1659,8 +1667,8 @@ xfs_da3_node_order(
 	node2 = node2_bp->b_addr;
 	xfs_da3_node_hdr_from_disk(&node1hdr, node1);
 	xfs_da3_node_hdr_from_disk(&node2hdr, node2);
-	btree1 = xfs_da3_node_tree_p(node1);
-	btree2 = xfs_da3_node_tree_p(node2);
+	btree1 = dp->d_ops->node_tree_p(node1);
+	btree2 = dp->d_ops->node_tree_p(node2);
 
 	if (node1hdr.count > 0 && node2hdr.count > 0 &&
 	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
@@ -1687,6 +1695,7 @@ xfs_da3_blk_link(
 	struct xfs_buf		*bp;
 	int			before = 0;
 	int			error;
+	struct xfs_inode	*dp = state->args->dp;
 
 	/*
 	 * Set up environment.
@@ -1704,10 +1713,10 @@ xfs_da3_blk_link(
 		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
 		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		before = xfs_dir2_leafn_order(args->dp, old_blk->bp, new_blk->bp);
+		before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
 		break;
 	case XFS_DA_NODE_MAGIC:
-		before = xfs_da3_node_order(old_blk->bp, new_blk->bp);
+		before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
 		break;
 	}
 
@@ -1722,7 +1731,7 @@ xfs_da3_blk_link(
 		new_info->forw = cpu_to_be32(old_blk->blkno);
 		new_info->back = old_info->back;
 		if (old_info->back) {
-			error = xfs_da3_node_read(args->trans, args->dp,
+			error = xfs_da3_node_read(args->trans, dp,
 						be32_to_cpu(old_info->back),
 						-1, &bp, args->whichfork);
 			if (error)
@@ -1743,7 +1752,7 @@ xfs_da3_blk_link(
 		new_info->forw = old_info->forw;
 		new_info->back = cpu_to_be32(old_blk->blkno);
 		if (old_info->forw) {
-			error = xfs_da3_node_read(args->trans, args->dp,
+			error = xfs_da3_node_read(args->trans, dp,
 						be32_to_cpu(old_info->forw),
 						-1, &bp, args->whichfork);
 			if (error)
@@ -1863,6 +1872,7 @@ xfs_da3_path_shift(
 	xfs_dablk_t		blkno = 0;
 	int			level;
 	int			error;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_path_shift(state->args);
 
@@ -1879,7 +1889,7 @@ xfs_da3_path_shift(
 	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
 		node = blk->bp->b_addr;
 		xfs_da3_node_hdr_from_disk(&nodehdr, node);
-		btree = xfs_da3_node_tree_p(node);
+		btree = dp->d_ops->node_tree_p(node);
 
 		if (forward && (blk->index < nodehdr.count - 1)) {
 			blk->index++;
@@ -1913,7 +1923,7 @@ xfs_da3_path_shift(
 		 * Read the next child block.
 		 */
 		blk->blkno = blkno;
-		error = xfs_da3_node_read(args->trans, args->dp, blkno, -1,
+		error = xfs_da3_node_read(args->trans, dp, blkno, -1,
 					&blk->bp, args->whichfork);
 		if (error)
 			return(error);
@@ -1936,7 +1946,7 @@ xfs_da3_path_shift(
 			blk->magic = XFS_DA_NODE_MAGIC;
 			node = (xfs_da_intnode_t *)info;
 			xfs_da3_node_hdr_from_disk(&nodehdr, node);
-			btree = xfs_da3_node_tree_p(node);
+			btree = dp->d_ops->node_tree_p(node);
 			blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 			if (forward)
 				blk->index = 0;
@@ -2164,7 +2174,7 @@ xfs_da3_swap_lastblock(
 	struct xfs_dir2_leaf	*dead_leaf2;
 	struct xfs_da_node_entry *btree;
 	struct xfs_da3_icnode_hdr par_hdr;
-	struct xfs_inode	*ip;
+	struct xfs_inode	*dp;
 	struct xfs_trans	*tp;
 	struct xfs_mount	*mp;
 	struct xfs_buf		*dead_buf;
@@ -2188,12 +2198,12 @@ xfs_da3_swap_lastblock(
 	dead_buf = *dead_bufp;
 	dead_blkno = *dead_blknop;
 	tp = args->trans;
-	ip = args->dp;
+	dp = args->dp;
 	w = args->whichfork;
 	ASSERT(w == XFS_DATA_FORK);
-	mp = ip->i_mount;
+	mp = dp->i_mount;
 	lastoff = mp->m_dirfreeblk;
-	error = xfs_bmap_last_before(tp, ip, &lastoff, w);
+	error = xfs_bmap_last_before(tp, dp, &lastoff, w);
 	if (error)
 		return error;
 	if (unlikely(lastoff == 0)) {
@@ -2205,7 +2215,7 @@ xfs_da3_swap_lastblock(
 	 * Read the last block in the btree space.
 	 */
 	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
-	error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+	error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
 	if (error)
 		return error;
 	/*
@@ -2224,7 +2234,7 @@ xfs_da3_swap_lastblock(
 
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
 		xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2);
-		ents = ip->d_ops->leaf_ents_p(dead_leaf2);
+		ents = dp->d_ops->leaf_ents_p(dead_leaf2);
 		dead_level = 0;
 		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
 	} else {
@@ -2232,7 +2242,7 @@ xfs_da3_swap_lastblock(
 
 		dead_node = (xfs_da_intnode_t *)dead_info;
 		xfs_da3_node_hdr_from_disk(&deadhdr, dead_node);
-		btree = xfs_da3_node_tree_p(dead_node);
+		btree = dp->d_ops->node_tree_p(dead_node);
 		dead_level = deadhdr.level;
 		dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
 	}
@@ -2241,7 +2251,7 @@ xfs_da3_swap_lastblock(
 	 * If the moved block has a left sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
-		error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
 		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
@@ -2263,7 +2273,7 @@ xfs_da3_swap_lastblock(
 	 * If the moved block has a right sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
-		error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
 		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
@@ -2287,7 +2297,7 @@ xfs_da3_swap_lastblock(
 	 * Walk down the tree looking for the parent of the moved block.
 	 */
 	for (;;) {
-		error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
@@ -2299,7 +2309,7 @@ xfs_da3_swap_lastblock(
 			goto done;
 		}
 		level = par_hdr.level;
-		btree = xfs_da3_node_tree_p(par_node);
+		btree = dp->d_ops->node_tree_p(par_node);
 		for (entno = 0;
 		     entno < par_hdr.count &&
 		     be32_to_cpu(btree[entno].hashval) < dead_hash;
@@ -2338,7 +2348,7 @@ xfs_da3_swap_lastblock(
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
-		error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
@@ -2349,7 +2359,7 @@ xfs_da3_swap_lastblock(
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
-		btree = xfs_da3_node_tree_p(par_node);
+		btree = dp->d_ops->node_tree_p(par_node);
 		entno = 0;
 	}
 	/*
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index ff8b50368c94..72b48b5ec69a 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -477,6 +477,33 @@ xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
 	return ((struct xfs_dir3_leaf *)lp)->__ents;
 }
 
+/*
+ * Directory/Attribute Node block operations
+ */
+static inline int
+xfs_da2_node_hdr_size(void)
+{
+	return sizeof(struct xfs_da_node_hdr);
+}
+
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+	return dap->__btree;
+}
+
+static inline int
+xfs_da3_node_hdr_size(void)
+{
+	return sizeof(struct xfs_da3_node_hdr);
+}
+
+static inline struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+	return ((struct xfs_da3_intnode *)dap)->__btree;
+}
+
 const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_entsize = xfs_dir2_sf_entsize,
 	.sf_nextentry = xfs_dir2_sf_nextentry,
@@ -508,6 +535,8 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.leaf_max_ents = xfs_dir2_max_leaf_ents,
 	.leaf_ents_p = xfs_dir2_leaf_ents_p,
 
+	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_tree_p = xfs_da2_node_tree_p,
 };
 
 const struct xfs_dir_ops xfs_dir2_ftype_ops = {
@@ -540,6 +569,9 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.leaf_hdr_size = xfs_dir2_leaf_hdr_size,
 	.leaf_max_ents = xfs_dir2_max_leaf_ents,
 	.leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_tree_p = xfs_da2_node_tree_p,
 };
 
 const struct xfs_dir_ops xfs_dir3_ops = {
@@ -572,6 +604,19 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.leaf_hdr_size = xfs_dir3_leaf_hdr_size,
 	.leaf_max_ents = xfs_dir3_max_leaf_ents,
 	.leaf_ents_p = xfs_dir3_leaf_ents_p,
+
+	.node_hdr_size = xfs_da3_node_hdr_size,
+	.node_tree_p = xfs_da3_node_tree_p,
+};
+
+const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_tree_p = xfs_da2_node_tree_p,
+};
+
+const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+	.node_hdr_size = xfs_da3_node_hdr_size,
+	.node_tree_p = xfs_da3_node_tree_p,
 };
 
 /*
@@ -594,3 +639,17 @@ xfs_dir_get_ops(
 		return &xfs_dir2_ftype_ops;
 	return &xfs_dir2_ops;
 }
+
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*dp)
+{
+	if (dp)
+		return dp->d_ops;
+	if (mp->m_nondir_inode_ops)
+		return mp->m_nondir_inode_ops;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return &xfs_dir3_nondir_ops;
+	return &xfs_dir2_nondir_ops;
+}
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index 0a567e25cf59..69b4c6e1c52e 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -127,31 +127,6 @@ extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
 extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
 				     struct xfs_da3_icnode_hdr *from);
 
-static inline int
-__xfs_da3_node_hdr_size(bool v3)
-{
-	if (v3)
-		return sizeof(struct xfs_da3_node_hdr);
-	return sizeof(struct xfs_da_node_hdr);
-}
-static inline int
-xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
-{
-	bool	v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
-
-	return __xfs_da3_node_hdr_size(v3);
-}
-
-static inline struct xfs_da_node_entry *
-xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
-{
-	if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
-		struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap;
-		return dap3->__btree;
-	}
-	return dap->__btree;
-}
-
 extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to,
 				      struct xfs_da_intnode *from);
 extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to,
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 2b98a33ca383..1b44e83924b7 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -95,13 +95,17 @@ xfs_dir_mount(
 	ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
 	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
 	       XFS_MAX_BLOCKSIZE);
+
+	mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+	mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+
 	mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
 	mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
 	mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
 	mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
 	mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
 
-	nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
+	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size();
 	mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
 				(uint)sizeof(xfs_da_node_entry_t);
 	mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
@@ -113,7 +117,6 @@ xfs_dir_mount(
 	else
 		mp->m_dirnameops = &xfs_default_nameops;
 
-	mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 9ba9db731990..c5cad9d9239d 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -79,10 +79,16 @@ struct xfs_dir_ops {
 	int	(*leaf_max_ents)(struct xfs_mount *mp);
 	struct xfs_dir2_leaf_entry *
 		(*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+
+	int	(*node_hdr_size)(void);
+	struct xfs_da_node_entry *
+		(*node_tree_p)(struct xfs_da_intnode *dap);
 };
 
 extern const struct xfs_dir_ops *
 	xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+	xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
 
 /*
  * Generic directory interface routines
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 0493587ea6bc..c4cd6d47f523 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1203,6 +1203,7 @@ xfs_setup_inode(
 	inode->i_ctime.tv_nsec	= ip->i_d.di_ctime.t_nsec;
 	xfs_diflags_to_iflags(inode, ip);
 
+	ip->d_ops = ip->i_mount->m_nondir_inode_ops;
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_op = &xfs_inode_operations;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 973397f66c6b..1d8101a10d8e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -150,6 +150,7 @@ typedef struct xfs_mount {
 	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
 	const struct xfs_nameops *m_dirnameops;	/* vector of dir name ops */
 	const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
+	const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
 	int			m_dirblksize;	/* directory block sz--bytes */
 	int			m_dirblkfsbs;	/* directory block sz--fsbs */
 	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
-- 
cgit v1.3


From 01ba43b873d9e91ba2e0341fe8cb7e89eaa41661 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:52 +1100
Subject: xfs: vectorise encoding/decoding directory headers

Conversion from on-disk structures to in-core header structures
currently relies on magic number checks. If the magic number is
wrong, but one of the supported values, we do the wrong thing with
the encode/decode operation. Split these functions so that there are
discrete operations for the specific directory format we are
handling.

In doing this, move all the header encode/decode functions to
xfs_da_format.c as they are directly manipulating the on-disk
format. It should be noted that all the growth in binary size is
from xfs_da_format.c - the rest of the code actaully shrinks.

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2
 789293   96802    1096  887191   d8997 fs/xfs/xfs.o.p3
 789005   96802    1096  886903   d8997 fs/xfs/xfs.o.p4
 789061   96802    1096  886959   d88af fs/xfs/xfs.o.p5
 789733   96802    1096  887631   d8b4f fs/xfs/xfs.o.p6
 791421   96802    1096  889319   d91e7 fs/xfs/xfs.o.p7

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr_inactive.c |   2 +-
 fs/xfs/xfs_attr_leaf.c     |   4 +-
 fs/xfs/xfs_attr_list.c     |   2 +-
 fs/xfs/xfs_da_btree.c      | 141 +++++++++++-------------------
 fs/xfs/xfs_da_format.c     | 209 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_da_format.h     |  16 ----
 fs/xfs/xfs_dir2.h          |  13 +++
 fs/xfs/xfs_dir2_block.c    |   2 +-
 fs/xfs/xfs_dir2_leaf.c     | 100 ++++++----------------
 fs/xfs/xfs_dir2_node.c     | 126 +++++++++------------------
 fs/xfs/xfs_dir2_priv.h     |   4 -
 11 files changed, 338 insertions(+), 281 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 4855085f8c6b..09480c57f069 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -231,7 +231,7 @@ xfs_attr3_node_inactive(
 	}
 
 	node = bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&ichdr, node);
+	dp->d_ops->node_hdr_from_disk(&ichdr, node);
 	parent_blkno = bp->b_bn;
 	if (!ichdr.count) {
 		xfs_trans_brelse(*trans, bp);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 82f1354c77b6..fdf9992dcbbc 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -916,7 +916,7 @@ xfs_attr3_leaf_to_node(
 	if (error)
 		goto out;
 	node = bp1->b_addr;
-	xfs_da3_node_hdr_from_disk(&icnodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
 	btree = dp->d_ops->node_tree_p(node);
 
 	leaf = bp2->b_addr;
@@ -927,7 +927,7 @@ xfs_attr3_leaf_to_node(
 	btree[0].hashval = entries[icleafhdr.count - 1].hashval;
 	btree[0].before = cpu_to_be32(blkno);
 	icnodehdr.count = 1;
-	xfs_da3_node_hdr_to_disk(node, &icnodehdr);
+	dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
 	xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1);
 	error = 0;
 out:
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index ea1c4c46c24f..2d174b128153 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -311,7 +311,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 				return XFS_ERROR(EFSCORRUPTED);
 			}
 
-			xfs_da3_node_hdr_from_disk(&nodehdr, node);
+			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 			btree = dp->d_ops->node_tree_p(node);
 			for (i = 0; i < nodehdr.count; btree++, i++) {
 				if (cursor->hashval
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 26dfc42a28f9..a51762dae543 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -129,56 +129,6 @@ xfs_da_state_free(xfs_da_state_t *state)
 	kmem_zone_free(xfs_da_state_zone, state);
 }
 
-void
-xfs_da3_node_hdr_from_disk(
-	struct xfs_da3_icnode_hdr	*to,
-	struct xfs_da_intnode		*from)
-{
-	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-	       from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
-
-	if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
-		struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
-
-		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-		to->back = be32_to_cpu(hdr3->info.hdr.back);
-		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-		to->count = be16_to_cpu(hdr3->__count);
-		to->level = be16_to_cpu(hdr3->__level);
-		return;
-	}
-	to->forw = be32_to_cpu(from->hdr.info.forw);
-	to->back = be32_to_cpu(from->hdr.info.back);
-	to->magic = be16_to_cpu(from->hdr.info.magic);
-	to->count = be16_to_cpu(from->hdr.__count);
-	to->level = be16_to_cpu(from->hdr.__level);
-}
-
-void
-xfs_da3_node_hdr_to_disk(
-	struct xfs_da_intnode		*to,
-	struct xfs_da3_icnode_hdr	*from)
-{
-	ASSERT(from->magic == XFS_DA_NODE_MAGIC ||
-	       from->magic == XFS_DA3_NODE_MAGIC);
-
-	if (from->magic == XFS_DA3_NODE_MAGIC) {
-		struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
-
-		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-		hdr3->info.hdr.back = cpu_to_be32(from->back);
-		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-		hdr3->__count = cpu_to_be16(from->count);
-		hdr3->__level = cpu_to_be16(from->level);
-		return;
-	}
-	to->hdr.info.forw = cpu_to_be32(from->forw);
-	to->hdr.info.back = cpu_to_be32(from->back);
-	to->hdr.info.magic = cpu_to_be16(from->magic);
-	to->hdr.__count = cpu_to_be16(from->count);
-	to->hdr.__level = cpu_to_be16(from->level);
-}
-
 static bool
 xfs_da3_node_verify(
 	struct xfs_buf		*bp)
@@ -186,8 +136,11 @@ xfs_da3_node_verify(
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_da_intnode	*hdr = bp->b_addr;
 	struct xfs_da3_icnode_hdr ichdr;
+	const struct xfs_dir_ops *ops;
+
+	ops = xfs_dir_get_ops(mp, NULL);
 
-	xfs_da3_node_hdr_from_disk(&ichdr, hdr);
+	ops->node_hdr_from_disk(&ichdr, hdr);
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
@@ -354,11 +307,12 @@ xfs_da3_node_create(
 	struct xfs_da3_icnode_hdr ichdr = {0};
 	struct xfs_buf		*bp;
 	int			error;
+	struct xfs_inode	*dp = args->dp;
 
 	trace_xfs_da_node_create(args);
 	ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
 
-	error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
 	if (error)
 		return(error);
 	bp->b_ops = &xfs_da3_node_buf_ops;
@@ -377,10 +331,10 @@ xfs_da3_node_create(
 	}
 	ichdr.level = level;
 
-	xfs_da3_node_hdr_to_disk(node, &ichdr);
+	dp->d_ops->node_hdr_to_disk(node, &ichdr);
 	xfs_trans_log_buf(tp, bp,
 		XFS_DA_LOGRANGE(node, &node->hdr,
-				args->dp->d_ops->node_hdr_size()));
+				dp->d_ops->node_hdr_size()));
 
 	*bpp = bp;
 	return(0);
@@ -590,7 +544,7 @@ xfs_da3_root_split(
 	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
 		struct xfs_da3_icnode_hdr nodehdr;
 
-		xfs_da3_node_hdr_from_disk(&nodehdr, oldroot);
+		dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
 		btree = dp->d_ops->node_tree_p(oldroot);
 		size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
 		level = nodehdr.level;
@@ -605,7 +559,7 @@ xfs_da3_root_split(
 		struct xfs_dir2_leaf_entry *ents;
 
 		leaf = (xfs_dir2_leaf_t *)oldroot;
-		xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+		dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 		ents = dp->d_ops->leaf_ents_p(leaf);
 
 		ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
@@ -650,14 +604,14 @@ xfs_da3_root_split(
 		return error;
 
 	node = bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 	btree = dp->d_ops->node_tree_p(node);
 	btree[0].hashval = cpu_to_be32(blk1->hashval);
 	btree[0].before = cpu_to_be32(blk1->blkno);
 	btree[1].hashval = cpu_to_be32(blk2->hashval);
 	btree[1].before = cpu_to_be32(blk2->blkno);
 	nodehdr.count = 2;
-	xfs_da3_node_hdr_to_disk(node, &nodehdr);
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 
 #ifdef DEBUG
 	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
@@ -694,11 +648,12 @@ xfs_da3_node_split(
 	int			newcount;
 	int			error;
 	int			useextra;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_node_split(state->args);
 
 	node = oldblk->bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 
 	/*
 	 * With V2 dirs the extra block is data or freespace.
@@ -745,7 +700,7 @@ xfs_da3_node_split(
 	 * If we had double-split op below us, then add the extra block too.
 	 */
 	node = oldblk->bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 	if (oldblk->index <= nodehdr.count) {
 		oldblk->index++;
 		xfs_da3_node_add(state, oldblk, addblk);
@@ -800,8 +755,8 @@ xfs_da3_node_rebalance(
 
 	node1 = blk1->bp->b_addr;
 	node2 = blk2->bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
-	xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+	dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+	dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 	btree1 = dp->d_ops->node_tree_p(node1);
 	btree2 = dp->d_ops->node_tree_p(node2);
 
@@ -816,8 +771,8 @@ xfs_da3_node_rebalance(
 		tmpnode = node1;
 		node1 = node2;
 		node2 = tmpnode;
-		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
-		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 		btree1 = dp->d_ops->node_tree_p(node1);
 		btree2 = dp->d_ops->node_tree_p(node2);
 		swap = 1;
@@ -881,12 +836,12 @@ xfs_da3_node_rebalance(
 	/*
 	 * Log header of node 1 and all current bits of node 2.
 	 */
-	xfs_da3_node_hdr_to_disk(node1, &nodehdr1);
+	dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
 	xfs_trans_log_buf(tp, blk1->bp,
 		XFS_DA_LOGRANGE(node1, &node1->hdr,
 				dp->d_ops->node_hdr_size()));
 
-	xfs_da3_node_hdr_to_disk(node2, &nodehdr2);
+	dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
 	xfs_trans_log_buf(tp, blk2->bp,
 		XFS_DA_LOGRANGE(node2, &node2->hdr,
 				dp->d_ops->node_hdr_size() +
@@ -899,8 +854,8 @@ xfs_da3_node_rebalance(
 	if (swap) {
 		node1 = blk1->bp->b_addr;
 		node2 = blk2->bp->b_addr;
-		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
-		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+		dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+		dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 		btree1 = dp->d_ops->node_tree_p(node1);
 		btree2 = dp->d_ops->node_tree_p(node2);
 	}
@@ -934,7 +889,7 @@ xfs_da3_node_add(
 	trace_xfs_da_node_add(state->args);
 
 	node = oldblk->bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 	btree = dp->d_ops->node_tree_p(node);
 
 	ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
@@ -958,7 +913,7 @@ xfs_da3_node_add(
 				tmp + sizeof(*btree)));
 
 	nodehdr.count += 1;
-	xfs_da3_node_hdr_to_disk(node, &nodehdr);
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, oldblk->bp,
 		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size()));
 
@@ -1097,6 +1052,7 @@ xfs_da3_root_join(
 	struct xfs_da3_icnode_hdr oldroothdr;
 	struct xfs_da_node_entry *btree;
 	int			error;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_root_join(state->args);
 
@@ -1104,7 +1060,7 @@ xfs_da3_root_join(
 
 	args = state->args;
 	oldroot = root_blk->bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot);
+	dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
 	ASSERT(oldroothdr.forw == 0);
 	ASSERT(oldroothdr.back == 0);
 
@@ -1118,10 +1074,10 @@ xfs_da3_root_join(
 	 * Read in the (only) child block, then copy those bytes into
 	 * the root block's buffer and free the original child block.
 	 */
-	btree = args->dp->d_ops->node_tree_p(oldroot);
+	btree = dp->d_ops->node_tree_p(oldroot);
 	child = be32_to_cpu(btree[0].before);
 	ASSERT(child != 0);
-	error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp,
+	error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
 					     args->whichfork);
 	if (error)
 		return error;
@@ -1171,6 +1127,7 @@ xfs_da3_node_toosmall(
 	int			error;
 	int			retval;
 	int			i;
+	struct xfs_inode	*dp = state->args->dp;
 
 	trace_xfs_da_node_toosmall(state->args);
 
@@ -1182,7 +1139,7 @@ xfs_da3_node_toosmall(
 	blk = &state->path.blk[ state->path.active-1 ];
 	info = blk->bp->b_addr;
 	node = (xfs_da_intnode_t *)info;
-	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 	if (nodehdr.count > (state->node_ents >> 1)) {
 		*action = 0;	/* blk over 50%, don't try to join */
 		return(0);	/* blk over 50%, don't try to join */
@@ -1234,13 +1191,13 @@ xfs_da3_node_toosmall(
 			blkno = nodehdr.back;
 		if (blkno == 0)
 			continue;
-		error = xfs_da3_node_read(state->args->trans, state->args->dp,
+		error = xfs_da3_node_read(state->args->trans, dp,
 					blkno, -1, &bp, state->args->whichfork);
 		if (error)
 			return(error);
 
 		node = bp->b_addr;
-		xfs_da3_node_hdr_from_disk(&thdr, node);
+		dp->d_ops->node_hdr_from_disk(&thdr, node);
 		xfs_trans_brelse(state->args->trans, bp);
 
 		if (count - thdr.count >= 0)
@@ -1287,7 +1244,7 @@ xfs_da3_node_lasthash(
 	struct xfs_da3_icnode_hdr nodehdr;
 
 	node = bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 	if (count)
 		*count = nodehdr.count;
 	if (!nodehdr.count)
@@ -1338,7 +1295,7 @@ xfs_da3_fixhashpath(
 		struct xfs_da3_icnode_hdr nodehdr;
 
 		node = blk->bp->b_addr;
-		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 		btree = dp->d_ops->node_tree_p(node);
 		if (be32_to_cpu(btree->hashval) == lasthash)
 			break;
@@ -1370,7 +1327,7 @@ xfs_da3_node_remove(
 	trace_xfs_da_node_remove(state->args);
 
 	node = drop_blk->bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 	ASSERT(drop_blk->index < nodehdr.count);
 	ASSERT(drop_blk->index >= 0);
 
@@ -1391,7 +1348,7 @@ xfs_da3_node_remove(
 	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
 	    XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
 	nodehdr.count -= 1;
-	xfs_da3_node_hdr_to_disk(node, &nodehdr);
+	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
 	    XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size()));
 
@@ -1426,8 +1383,8 @@ xfs_da3_node_unbalance(
 
 	drop_node = drop_blk->bp->b_addr;
 	save_node = save_blk->bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node);
-	xfs_da3_node_hdr_from_disk(&save_hdr, save_node);
+	dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+	dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
 	drop_btree = dp->d_ops->node_tree_p(drop_node);
 	save_btree = dp->d_ops->node_tree_p(save_node);
 	tp = state->args->trans;
@@ -1463,7 +1420,7 @@ xfs_da3_node_unbalance(
 	memcpy(&save_btree[sindex], &drop_btree[0], tmp);
 	save_hdr.count += drop_hdr.count;
 
-	xfs_da3_node_hdr_to_disk(save_node, &save_hdr);
+	dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
 	xfs_trans_log_buf(tp, save_blk->bp,
 		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
 				dp->d_ops->node_hdr_size()));
@@ -1556,7 +1513,7 @@ xfs_da3_node_lookup_int(
 		 * Search an intermediate node for a match.
 		 */
 		node = blk->bp->b_addr;
-		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 		btree = dp->d_ops->node_tree_p(node);
 
 		max = nodehdr.count;
@@ -1665,8 +1622,8 @@ xfs_da3_node_order(
 
 	node1 = node1_bp->b_addr;
 	node2 = node2_bp->b_addr;
-	xfs_da3_node_hdr_from_disk(&node1hdr, node1);
-	xfs_da3_node_hdr_from_disk(&node2hdr, node2);
+	dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+	dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
 	btree1 = dp->d_ops->node_tree_p(node1);
 	btree2 = dp->d_ops->node_tree_p(node2);
 
@@ -1888,7 +1845,7 @@ xfs_da3_path_shift(
 	level = (path->active-1) - 1;	/* skip bottom layer in path */
 	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
 		node = blk->bp->b_addr;
-		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 		btree = dp->d_ops->node_tree_p(node);
 
 		if (forward && (blk->index < nodehdr.count - 1)) {
@@ -1945,7 +1902,7 @@ xfs_da3_path_shift(
 		case XFS_DA3_NODE_MAGIC:
 			blk->magic = XFS_DA_NODE_MAGIC;
 			node = (xfs_da_intnode_t *)info;
-			xfs_da3_node_hdr_from_disk(&nodehdr, node);
+			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 			btree = dp->d_ops->node_tree_p(node);
 			blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 			if (forward)
@@ -2233,7 +2190,7 @@ xfs_da3_swap_lastblock(
 		struct xfs_dir2_leaf_entry *ents;
 
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
-		xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+		dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
 		ents = dp->d_ops->leaf_ents_p(dead_leaf2);
 		dead_level = 0;
 		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
@@ -2241,7 +2198,7 @@ xfs_da3_swap_lastblock(
 		struct xfs_da3_icnode_hdr deadhdr;
 
 		dead_node = (xfs_da_intnode_t *)dead_info;
-		xfs_da3_node_hdr_from_disk(&deadhdr, dead_node);
+		dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
 		btree = dp->d_ops->node_tree_p(dead_node);
 		dead_level = deadhdr.level;
 		dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
@@ -2301,7 +2258,7 @@ xfs_da3_swap_lastblock(
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
-		xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
 		if (level >= 0 && level != par_hdr.level + 1) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
 					 XFS_ERRLEVEL_LOW, mp);
@@ -2352,7 +2309,7 @@ xfs_da3_swap_lastblock(
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
-		xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
 		if (par_hdr.level != level) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
 					 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index 72b48b5ec69a..b232c275791d 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -464,19 +464,84 @@ xfs_dir3_leaf_hdr_size(void)
 	return sizeof(struct xfs_dir3_leaf_hdr);
 }
 
-static inline int
+static int
 xfs_dir3_max_leaf_ents(struct xfs_mount *mp)
 {
 	return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size()) /
 		(uint)sizeof(struct xfs_dir2_leaf_entry);
 }
 
-static inline struct xfs_dir2_leaf_entry *
+static struct xfs_dir2_leaf_entry *
 xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
 {
 	return ((struct xfs_dir3_leaf *)lp)->__ents;
 }
 
+static void
+xfs_dir2_leaf_hdr_from_disk(
+	struct xfs_dir3_icleaf_hdr	*to,
+	struct xfs_dir2_leaf		*from)
+{
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.count);
+	to->stale = be16_to_cpu(from->hdr.stale);
+
+	ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir2_leaf_hdr_to_disk(
+	struct xfs_dir2_leaf		*to,
+	struct xfs_dir3_icleaf_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.count = cpu_to_be16(from->count);
+	to->hdr.stale = cpu_to_be16(from->stale);
+}
+
+static void
+xfs_dir3_leaf_hdr_from_disk(
+	struct xfs_dir3_icleaf_hdr	*to,
+	struct xfs_dir2_leaf		*from)
+{
+	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+	to->back = be32_to_cpu(hdr3->info.hdr.back);
+	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+	to->count = be16_to_cpu(hdr3->count);
+	to->stale = be16_to_cpu(hdr3->stale);
+
+	ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leaf_hdr_to_disk(
+	struct xfs_dir2_leaf		*to,
+	struct xfs_dir3_icleaf_hdr	*from)
+{
+	struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+	ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+	hdr3->info.hdr.back = cpu_to_be32(from->back);
+	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+	hdr3->count = cpu_to_be16(from->count);
+	hdr3->stale = cpu_to_be16(from->stale);
+}
+
+
 /*
  * Directory/Attribute Node block operations
  */
@@ -504,6 +569,121 @@ xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
 	return ((struct xfs_da3_intnode *)dap)->__btree;
 }
 
+static void
+xfs_da2_node_hdr_from_disk(
+	struct xfs_da3_icnode_hdr	*to,
+	struct xfs_da_intnode		*from)
+{
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.__count);
+	to->level = be16_to_cpu(from->hdr.__level);
+}
+
+static void
+xfs_da2_node_hdr_to_disk(
+	struct xfs_da_intnode		*to,
+	struct xfs_da3_icnode_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.__count = cpu_to_be16(from->count);
+	to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static void
+xfs_da3_node_hdr_from_disk(
+	struct xfs_da3_icnode_hdr	*to,
+	struct xfs_da_intnode		*from)
+{
+	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+	to->back = be32_to_cpu(hdr3->info.hdr.back);
+	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+	to->count = be16_to_cpu(hdr3->__count);
+	to->level = be16_to_cpu(hdr3->__level);
+}
+
+static void
+xfs_da3_node_hdr_to_disk(
+	struct xfs_da_intnode		*to,
+	struct xfs_da3_icnode_hdr	*from)
+{
+	struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+	ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+	hdr3->info.hdr.back = cpu_to_be32(from->back);
+	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+	hdr3->__count = cpu_to_be16(from->count);
+	hdr3->__level = cpu_to_be16(from->level);
+}
+
+
+/*
+ * Directory free space block operations
+ */
+static void
+xfs_dir2_free_hdr_from_disk(
+	struct xfs_dir3_icfree_hdr	*to,
+	struct xfs_dir2_free		*from)
+{
+	to->magic = be32_to_cpu(from->hdr.magic);
+	to->firstdb = be32_to_cpu(from->hdr.firstdb);
+	to->nvalid = be32_to_cpu(from->hdr.nvalid);
+	to->nused = be32_to_cpu(from->hdr.nused);
+	ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+	struct xfs_dir2_free		*to,
+	struct xfs_dir3_icfree_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+	to->hdr.magic = cpu_to_be32(from->magic);
+	to->hdr.firstdb = cpu_to_be32(from->firstdb);
+	to->hdr.nvalid = cpu_to_be32(from->nvalid);
+	to->hdr.nused = cpu_to_be32(from->nused);
+}
+
+static void
+xfs_dir3_free_hdr_from_disk(
+	struct xfs_dir3_icfree_hdr	*to,
+	struct xfs_dir2_free		*from)
+{
+	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+	to->magic = be32_to_cpu(hdr3->hdr.magic);
+	to->firstdb = be32_to_cpu(hdr3->firstdb);
+	to->nvalid = be32_to_cpu(hdr3->nvalid);
+	to->nused = be32_to_cpu(hdr3->nused);
+
+	ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+	struct xfs_dir2_free		*to,
+	struct xfs_dir3_icfree_hdr	*from)
+{
+	struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+	ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+	hdr3->hdr.magic = cpu_to_be32(from->magic);
+	hdr3->firstdb = cpu_to_be32(from->firstdb);
+	hdr3->nvalid = cpu_to_be32(from->nvalid);
+	hdr3->nused = cpu_to_be32(from->nused);
+}
+
 const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_entsize = xfs_dir2_sf_entsize,
 	.sf_nextentry = xfs_dir2_sf_nextentry,
@@ -532,11 +712,18 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.data_unused_p = xfs_dir2_data_unused_p,
 
 	.leaf_hdr_size = xfs_dir2_leaf_hdr_size,
+	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
 	.leaf_max_ents = xfs_dir2_max_leaf_ents,
 	.leaf_ents_p = xfs_dir2_leaf_ents_p,
 
 	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
+
+	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
 };
 
 const struct xfs_dir_ops xfs_dir2_ftype_ops = {
@@ -567,11 +754,18 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.data_unused_p = xfs_dir2_data_unused_p,
 
 	.leaf_hdr_size = xfs_dir2_leaf_hdr_size,
+	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
 	.leaf_max_ents = xfs_dir2_max_leaf_ents,
 	.leaf_ents_p = xfs_dir2_leaf_ents_p,
 
 	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
+
+	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
 };
 
 const struct xfs_dir_ops xfs_dir3_ops = {
@@ -602,20 +796,31 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.data_unused_p = xfs_dir3_data_unused_p,
 
 	.leaf_hdr_size = xfs_dir3_leaf_hdr_size,
+	.leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+	.leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
 	.leaf_max_ents = xfs_dir3_max_leaf_ents,
 	.leaf_ents_p = xfs_dir3_leaf_ents_p,
 
 	.node_hdr_size = xfs_da3_node_hdr_size,
+	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
 	.node_tree_p = xfs_da3_node_tree_p,
+
+	.free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+	.free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
 };
 
 const struct xfs_dir_ops xfs_dir2_nondir_ops = {
 	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
 };
 
 const struct xfs_dir_ops xfs_dir3_nondir_ops = {
 	.node_hdr_size = xfs_da3_node_hdr_size,
+	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
 	.node_tree_p = xfs_da3_node_tree_p,
 };
 
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index 69b4c6e1c52e..1ff7b9f42097 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -122,16 +122,6 @@ struct xfs_da3_icnode_hdr {
 	__uint16_t	level;
 };
 
-extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
-				       struct xfs_da_intnode *from);
-extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
-				     struct xfs_da3_icnode_hdr *from);
-
-extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to,
-				      struct xfs_da_intnode *from);
-extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to,
-				    struct xfs_da3_icnode_hdr *from);
-
 #define	XFS_LBSIZE(mp)	(mp)->m_sb.sb_blocksize
 
 /*
@@ -523,9 +513,6 @@ struct xfs_dir3_leaf {
 
 #define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
 
-extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
-					struct xfs_dir2_leaf *from);
-
 /*
  * Get address of the bestcount field in the single-leaf block.
  */
@@ -717,9 +704,6 @@ struct xfs_dir3_icfree_hdr {
 
 };
 
-void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to,
-				 struct xfs_dir2_free *from);
-
 static inline int
 xfs_dir3_free_hdr_size(struct xfs_mount *mp)
 {
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index c5cad9d9239d..61195348f667 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -76,13 +76,26 @@ struct xfs_dir_ops {
 		(*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
 
 	int	(*leaf_hdr_size)(void);
+	void	(*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+				    struct xfs_dir3_icleaf_hdr *from);
+	void	(*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+				      struct xfs_dir2_leaf *from);
 	int	(*leaf_max_ents)(struct xfs_mount *mp);
 	struct xfs_dir2_leaf_entry *
 		(*leaf_ents_p)(struct xfs_dir2_leaf *lp);
 
 	int	(*node_hdr_size)(void);
+	void	(*node_hdr_to_disk)(struct xfs_da_intnode *to,
+				    struct xfs_da3_icnode_hdr *from);
+	void	(*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+				      struct xfs_da_intnode *from);
 	struct xfs_da_node_entry *
 		(*node_tree_p)(struct xfs_da_intnode *dap);
+
+	void	(*free_hdr_to_disk)(struct xfs_dir2_free *to,
+				    struct xfs_dir3_icfree_hdr *from);
+	void	(*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+				      struct xfs_dir2_free *from);
 };
 
 extern const struct xfs_dir_ops *
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 75f815786b97..c1ff552aa7bf 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -936,7 +936,7 @@ xfs_dir2_leaf_to_block(
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = lbp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 16fdc0e627a7..2fb8db9fb574 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -64,7 +64,7 @@ xfs_dir3_leaf1_check(
 	struct xfs_dir2_leaf	*leaf = bp->b_addr;
 	struct xfs_dir3_icleaf_hdr leafhdr;
 
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 
 	if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
 		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -79,62 +79,6 @@ xfs_dir3_leaf1_check(
 #define	xfs_dir3_leaf_check(dp, bp)
 #endif
 
-void
-xfs_dir3_leaf_hdr_from_disk(
-	struct xfs_dir3_icleaf_hdr	*to,
-	struct xfs_dir2_leaf		*from)
-{
-	if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-	    from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
-		to->forw = be32_to_cpu(from->hdr.info.forw);
-		to->back = be32_to_cpu(from->hdr.info.back);
-		to->magic = be16_to_cpu(from->hdr.info.magic);
-		to->count = be16_to_cpu(from->hdr.count);
-		to->stale = be16_to_cpu(from->hdr.stale);
-	} else {
-		struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
-
-		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
-		to->back = be32_to_cpu(hdr3->info.hdr.back);
-		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
-		to->count = be16_to_cpu(hdr3->count);
-		to->stale = be16_to_cpu(hdr3->stale);
-	}
-
-	ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
-	       to->magic == XFS_DIR3_LEAF1_MAGIC ||
-	       to->magic == XFS_DIR2_LEAFN_MAGIC ||
-	       to->magic == XFS_DIR3_LEAFN_MAGIC);
-}
-
-void
-xfs_dir3_leaf_hdr_to_disk(
-	struct xfs_dir2_leaf		*to,
-	struct xfs_dir3_icleaf_hdr	*from)
-{
-	ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
-	       from->magic == XFS_DIR3_LEAF1_MAGIC ||
-	       from->magic == XFS_DIR2_LEAFN_MAGIC ||
-	       from->magic == XFS_DIR3_LEAFN_MAGIC);
-
-	if (from->magic == XFS_DIR2_LEAF1_MAGIC ||
-	    from->magic == XFS_DIR2_LEAFN_MAGIC) {
-		to->hdr.info.forw = cpu_to_be32(from->forw);
-		to->hdr.info.back = cpu_to_be32(from->back);
-		to->hdr.info.magic = cpu_to_be16(from->magic);
-		to->hdr.count = cpu_to_be16(from->count);
-		to->hdr.stale = cpu_to_be16(from->stale);
-	} else {
-		struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
-
-		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
-		hdr3->info.hdr.back = cpu_to_be32(from->back);
-		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
-		hdr3->count = cpu_to_be16(from->count);
-		hdr3->stale = cpu_to_be16(from->stale);
-	}
-}
-
 bool
 xfs_dir3_leaf_check_int(
 	struct xfs_mount	*mp,
@@ -147,6 +91,7 @@ xfs_dir3_leaf_check_int(
 	int			stale;
 	int			i;
 	const struct xfs_dir_ops *ops;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	/*
 	 * we can be passed a null dp here from a verifier, so we need to go the
@@ -154,6 +99,11 @@ xfs_dir3_leaf_check_int(
 	 */
 	ops = xfs_dir_get_ops(mp, dp);
 
+	if (!hdr) {
+		ops->leaf_hdr_from_disk(&leafhdr, leaf);
+		hdr = &leafhdr;
+	}
+
 	ents = ops->leaf_ents_p(leaf);
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 
@@ -198,7 +148,6 @@ xfs_dir3_leaf_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_dir2_leaf	*leaf = bp->b_addr;
-	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
 
@@ -220,8 +169,7 @@ xfs_dir3_leaf_verify(
 			return false;
 	}
 
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
-	return xfs_dir3_leaf_check_int(mp, NULL, &leafhdr, leaf);
+	return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
 }
 
 static void
@@ -474,10 +422,10 @@ xfs_dir2_block_to_leaf(
 	/*
 	 * Set the counts in the leaf header.
 	 */
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	leafhdr.count = be32_to_cpu(btp->count);
 	leafhdr.stale = be32_to_cpu(btp->stale);
-	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
 	xfs_dir3_leaf_log_header(tp, dp, lbp);
 
 	/*
@@ -706,7 +654,7 @@ xfs_dir2_leaf_addname(
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
 	length = dp->d_ops->data_entsize(args->namelen);
 
@@ -945,7 +893,7 @@ xfs_dir2_leaf_addname(
 	/*
 	 * Log the leaf fields and give up the buffers.
 	 */
-	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
 	xfs_dir3_leaf_log_header(tp, dp, lbp);
 	xfs_dir3_leaf_log_ents(tp, dp, lbp, lfloglow, lfloghigh);
 	xfs_dir3_leaf_check(dp, lbp);
@@ -968,6 +916,7 @@ xfs_dir3_leaf_compact(
 	int		loglow;		/* first leaf entry to log */
 	int		to;		/* target leaf index */
 	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_inode *dp = args->dp;
 
 	leaf = bp->b_addr;
 	if (!leafhdr->stale)
@@ -976,7 +925,7 @@ xfs_dir3_leaf_compact(
 	/*
 	 * Compress out the stale entries in place.
 	 */
-	ents = args->dp->d_ops->leaf_ents_p(leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
 	for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
 		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 			continue;
@@ -997,11 +946,10 @@ xfs_dir3_leaf_compact(
 	leafhdr->count -= leafhdr->stale;
 	leafhdr->stale = 0;
 
-	xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr);
-	xfs_dir3_leaf_log_header(args->trans, args->dp, bp);
+	dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+	xfs_dir3_leaf_log_header(args->trans, dp, bp);
 	if (loglow != -1)
-		xfs_dir3_leaf_log_ents(args->trans, args->dp, bp,
-				       loglow, to - 1);
+		xfs_dir3_leaf_log_ents(args->trans, dp, bp, loglow, to - 1);
 }
 
 /*
@@ -1290,7 +1238,7 @@ xfs_dir2_leaf_lookup_int(
 	leaf = lbp->b_addr;
 	xfs_dir3_leaf_check(dp, lbp);
 	ents = dp->d_ops->leaf_ents_p(leaf);
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 
 	/*
 	 * Look for the first leaf entry with our hash value.
@@ -1425,7 +1373,7 @@ xfs_dir2_leaf_removename(
 	hdr = dbp->b_addr;
 	xfs_dir3_data_check(dp, dbp);
 	bf = dp->d_ops->data_bestfree_p(hdr);
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 	/*
 	 * Point to the leaf entry, use that to point to the data entry.
@@ -1449,7 +1397,7 @@ xfs_dir2_leaf_removename(
 	 * We just mark the leaf entry stale by putting a null in it.
 	 */
 	leafhdr.stale++;
-	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
 	xfs_dir3_leaf_log_header(tp, dp, lbp);
 
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
@@ -1602,7 +1550,7 @@ xfs_dir2_leaf_search_hash(
 
 	leaf = lbp->b_addr;
 	ents = args->dp->d_ops->leaf_ents_p(leaf);
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 
 	/*
 	 * Note, the table cannot be empty, so we have to go through the loop.
@@ -1791,7 +1739,7 @@ xfs_dir2_node_to_leaf(
 		return 0;
 	lbp = state->path.blk[0].bp;
 	leaf = lbp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 
 	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
 	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -1803,7 +1751,7 @@ xfs_dir2_node_to_leaf(
 	if (error)
 		return error;
 	free = fbp->b_addr;
-	xfs_dir3_free_hdr_from_disk(&freehdr, free);
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 	ASSERT(!freehdr.firstdb);
 
@@ -1840,7 +1788,7 @@ xfs_dir2_node_to_leaf(
 	memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free),
 		freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
 
-	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
 	xfs_dir3_leaf_log_header(tp, dp, lbp);
 	xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
 	xfs_dir3_leaf_log_tail(tp, lbp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 3a426ed9b16e..eaee8c36bcc4 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -68,7 +68,7 @@ xfs_dir3_leafn_check(
 	struct xfs_dir2_leaf	*leaf = bp->b_addr;
 	struct xfs_dir3_icleaf_hdr leafhdr;
 
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 
 	if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
 		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -192,53 +192,6 @@ xfs_dir2_free_try_read(
 	return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
 }
 
-
-void
-xfs_dir3_free_hdr_from_disk(
-	struct xfs_dir3_icfree_hdr	*to,
-	struct xfs_dir2_free		*from)
-{
-	if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) {
-		to->magic = be32_to_cpu(from->hdr.magic);
-		to->firstdb = be32_to_cpu(from->hdr.firstdb);
-		to->nvalid = be32_to_cpu(from->hdr.nvalid);
-		to->nused = be32_to_cpu(from->hdr.nused);
-	} else {
-		struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
-
-		to->magic = be32_to_cpu(hdr3->hdr.magic);
-		to->firstdb = be32_to_cpu(hdr3->firstdb);
-		to->nvalid = be32_to_cpu(hdr3->nvalid);
-		to->nused = be32_to_cpu(hdr3->nused);
-	}
-
-	ASSERT(to->magic == XFS_DIR2_FREE_MAGIC ||
-	       to->magic == XFS_DIR3_FREE_MAGIC);
-}
-
-static void
-xfs_dir3_free_hdr_to_disk(
-	struct xfs_dir2_free		*to,
-	struct xfs_dir3_icfree_hdr	*from)
-{
-	ASSERT(from->magic == XFS_DIR2_FREE_MAGIC ||
-	       from->magic == XFS_DIR3_FREE_MAGIC);
-
-	if (from->magic == XFS_DIR2_FREE_MAGIC) {
-		to->hdr.magic = cpu_to_be32(from->magic);
-		to->hdr.firstdb = cpu_to_be32(from->firstdb);
-		to->hdr.nvalid = cpu_to_be32(from->nvalid);
-		to->hdr.nused = cpu_to_be32(from->nused);
-	} else {
-		struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
-
-		hdr3->hdr.magic = cpu_to_be32(from->magic);
-		hdr3->firstdb = cpu_to_be32(from->firstdb);
-		hdr3->nvalid = cpu_to_be32(from->nvalid);
-		hdr3->nused = cpu_to_be32(from->nused);
-	}
-}
-
 static int
 xfs_dir3_free_get_buf(
 	struct xfs_trans	*tp,
@@ -276,7 +229,7 @@ xfs_dir3_free_get_buf(
 		uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
 	} else
 		hdr.magic = XFS_DIR2_FREE_MAGIC;
-	xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr);
+	dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
 	*bpp = bp;
 	return 0;
 }
@@ -368,7 +321,7 @@ xfs_dir2_leaf_to_node(
 		return error;
 
 	free = fbp->b_addr;
-	xfs_dir3_free_hdr_from_disk(&freehdr, free);
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 	ASSERT(be32_to_cpu(ltp->bestcount) <=
@@ -392,7 +345,7 @@ xfs_dir2_leaf_to_node(
 	freehdr.nused = n;
 	freehdr.nvalid = be32_to_cpu(ltp->bestcount);
 
-	xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
+	dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
 	xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1);
 	xfs_dir2_free_log_header(tp, fbp);
 
@@ -442,7 +395,7 @@ xfs_dir2_leafn_add(
 	mp = dp->i_mount;
 	tp = args->trans;
 	leaf = bp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	/*
@@ -497,7 +450,7 @@ xfs_dir2_leafn_add(
 	lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
 				args->blkno, args->index));
 
-	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
 	xfs_dir3_leaf_log_header(tp, dp, bp);
 	xfs_dir3_leaf_log_ents(tp, dp, bp, lfloglow, lfloghigh);
 	xfs_dir3_leaf_check(dp, bp);
@@ -507,20 +460,20 @@ xfs_dir2_leafn_add(
 #ifdef DEBUG
 static void
 xfs_dir2_free_hdr_check(
-	struct xfs_mount *mp,
+	struct xfs_inode *dp,
 	struct xfs_buf	*bp,
 	xfs_dir2_db_t	db)
 {
 	struct xfs_dir3_icfree_hdr hdr;
 
-	xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr);
+	dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
 
-	ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0);
+	ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(dp->i_mount)) == 0);
 	ASSERT(hdr.firstdb <= db);
 	ASSERT(db < hdr.firstdb + hdr.nvalid);
 }
 #else
-#define xfs_dir2_free_hdr_check(mp, dp, db)
+#define xfs_dir2_free_hdr_check(dp, bp, db)
 #endif	/* DEBUG */
 
 /*
@@ -537,7 +490,7 @@ xfs_dir2_leafn_lasthash(
 	struct xfs_dir2_leaf_entry *ents;
 	struct xfs_dir3_icleaf_hdr leafhdr;
 
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 
 	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
 	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -584,7 +537,7 @@ xfs_dir2_leafn_lookup_for_addname(
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	xfs_dir3_leaf_check(dp, bp);
@@ -655,7 +608,7 @@ xfs_dir2_leafn_lookup_for_addname(
 					return error;
 				free = curbp->b_addr;
 
-				xfs_dir2_free_hdr_check(mp, curbp, curdb);
+				xfs_dir2_free_hdr_check(dp, curbp, curdb);
 			}
 			/*
 			 * Get the index for our entry.
@@ -734,7 +687,7 @@ xfs_dir2_leafn_lookup_for_entry(
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	xfs_dir3_leaf_check(dp, bp);
@@ -969,8 +922,8 @@ xfs_dir2_leafn_order(
 	struct xfs_dir3_icleaf_hdr hdr1;
 	struct xfs_dir3_icleaf_hdr hdr2;
 
-	xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
-	xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
+	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
 	ents1 = dp->d_ops->leaf_ents_p(leaf1);
 	ents2 = dp->d_ops->leaf_ents_p(leaf2);
 
@@ -1025,8 +978,8 @@ xfs_dir2_leafn_rebalance(
 	}
 	leaf1 = blk1->bp->b_addr;
 	leaf2 = blk2->bp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
-	xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
+	dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+	dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
 	ents1 = dp->d_ops->leaf_ents_p(leaf1);
 	ents2 = dp->d_ops->leaf_ents_p(leaf2);
 
@@ -1074,8 +1027,8 @@ xfs_dir2_leafn_rebalance(
 	ASSERT(hdr1.stale + hdr2.stale == oldstale);
 
 	/* log the changes made when moving the entries */
-	xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1);
-	xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2);
+	dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+	dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
 	xfs_dir3_leaf_log_header(args->trans, dp, blk1->bp);
 	xfs_dir3_leaf_log_header(args->trans, dp, blk2->bp);
 
@@ -1124,8 +1077,9 @@ xfs_dir3_data_block_free(
 	int			logfree = 0;
 	__be16			*bests;
 	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_inode	*dp = args->dp;
 
-	xfs_dir3_free_hdr_from_disk(&freehdr, free);
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 	bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
 	if (hdr) {
@@ -1161,7 +1115,7 @@ xfs_dir3_data_block_free(
 		logfree = 1;
 	}
 
-	xfs_dir3_free_hdr_to_disk(free, &freehdr);
+	dp->d_ops->free_hdr_to_disk(free, &freehdr);
 	xfs_dir2_free_log_header(tp, fbp);
 
 	/*
@@ -1226,7 +1180,7 @@ xfs_dir2_leafn_remove(
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 
 	/*
@@ -1247,7 +1201,7 @@ xfs_dir2_leafn_remove(
 	 * Log the leaf block changes.
 	 */
 	leafhdr.stale++;
-	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
 	xfs_dir3_leaf_log_header(tp, dp, bp);
 
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
@@ -1298,7 +1252,7 @@ xfs_dir2_leafn_remove(
 #ifdef DEBUG
 	{
 		struct xfs_dir3_icfree_hdr freehdr;
-		xfs_dir3_free_hdr_from_disk(&freehdr, free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
 		ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) *
 					  (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
 	}
@@ -1449,7 +1403,7 @@ xfs_dir2_leafn_toosmall(
 	 */
 	blk = &state->path.blk[state->path.active - 1];
 	leaf = blk->bp->b_addr;
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 	ents = dp->d_ops->leaf_ents_p(leaf);
 	xfs_dir3_leaf_check(dp, blk->bp);
 
@@ -1511,7 +1465,7 @@ xfs_dir2_leafn_toosmall(
 		bytes = state->blocksize - (state->blocksize >> 2);
 
 		leaf = bp->b_addr;
-		xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf);
+		dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
 		ents = dp->d_ops->leaf_ents_p(leaf);
 		count += hdr2.count - hdr2.stale;
 		bytes -= count * sizeof(ents[0]);
@@ -1574,10 +1528,10 @@ xfs_dir2_leafn_unbalance(
 	drop_leaf = drop_blk->bp->b_addr;
 	save_leaf = save_blk->bp->b_addr;
 
-	xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf);
-	xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf);
-	sents = args->dp->d_ops->leaf_ents_p(save_leaf);
-	dents = args->dp->d_ops->leaf_ents_p(drop_leaf);
+	dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
+	dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
+	sents = dp->d_ops->leaf_ents_p(save_leaf);
+	dents = dp->d_ops->leaf_ents_p(drop_leaf);
 
 	/*
 	 * If there are any stale leaf entries, take this opportunity
@@ -1603,8 +1557,8 @@ xfs_dir2_leafn_unbalance(
 	save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
 
 	/* log the changes made when moving the entries */
-	xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr);
-	xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr);
+	dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
+	dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
 	xfs_dir3_leaf_log_header(args->trans, dp, save_blk->bp);
 	xfs_dir3_leaf_log_header(args->trans, dp, drop_blk->bp);
 
@@ -1735,7 +1689,7 @@ xfs_dir2_node_addname_int(
 		free = fbp->b_addr;
 		findex = fblk->index;
 		bests = xfs_dir3_free_bests_p(mp, free);
-		xfs_dir3_free_hdr_from_disk(&freehdr, free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 		/*
 		 * This means the free entry showed that the data block had
@@ -1828,7 +1782,7 @@ xfs_dir2_node_addname_int(
 		 * there, so we have to do it here to avoid warnings. Blech.
 		 */
 		bests = xfs_dir3_free_bests_p(mp, free);
-		xfs_dir3_free_hdr_from_disk(&freehdr, free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
 		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
 		    be16_to_cpu(bests[findex]) >= length)
 			dbno = freehdr.firstdb + findex;
@@ -1927,7 +1881,7 @@ xfs_dir2_node_addname_int(
 				return error;
 			free = fbp->b_addr;
 			bests = xfs_dir3_free_bests_p(mp, free);
-			xfs_dir3_free_hdr_from_disk(&freehdr, free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 			/*
 			 * Remember the first slot as our empty slot.
@@ -1937,7 +1891,7 @@ xfs_dir2_node_addname_int(
 		} else {
 			free = fbp->b_addr;
 			bests = xfs_dir3_free_bests_p(mp, free);
-			xfs_dir3_free_hdr_from_disk(&freehdr, free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
 		}
 
 		/*
@@ -1962,7 +1916,7 @@ xfs_dir2_node_addname_int(
 		 */
 		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
 			freehdr.nused++;
-			xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
+			dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
 			xfs_dir2_free_log_header(tp, fbp);
 		}
 		/*
@@ -2294,7 +2248,7 @@ xfs_dir2_node_trim_free(
 	if (!bp)
 		return 0;
 	free = bp->b_addr;
-	xfs_dir3_free_hdr_from_disk(&freehdr, free);
+	dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 	/*
 	 * If there are used entries, there's nothing to do.
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3e4b5ba7b41f..8b9d2281f85b 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -94,10 +94,6 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
 		int lowstale, int highstale, int *lfloglow, int *lfloghigh);
 extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
 
-extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
-		struct xfs_dir2_leaf *from);
-extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to,
-		struct xfs_dir3_icleaf_hdr *from);
 extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
 		struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
 
-- 
cgit v1.3


From 24dd0f546c366da68f33bbf2ecef2306f8c2281d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 30 Oct 2013 13:48:41 -0500
Subject: xfs: convert directory vector functions to constants

Next step in the vectorisation process is the directory free block
encode/decode operations. There are relatively few of these, though
there are quite a number of calls to them.

Binary sizes:

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2
 789293   96802    1096  887191   d8997 fs/xfs/xfs.o.p3
 789005   96802    1096  886903   d8997 fs/xfs/xfs.o.p4
 789061   96802    1096  886959   d88af fs/xfs/xfs.o.p5
 789733   96802    1096  887631   d8b4f fs/xfs/xfs.o.p6
 791421   96802    1096  889319   d91e7 fs/xfs/xfs.o.p7
 791701   96802    1096  889599   d92ff fs/xfs/xfs.o.p8

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_format.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_da_format.h | 39 ----------------------
 fs/xfs/xfs_dir2.h      |  5 +++
 fs/xfs/xfs_dir2_leaf.c |  2 +-
 fs/xfs/xfs_dir2_node.c | 61 +++++++++++++++++-----------------
 5 files changed, 126 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index b232c275791d..01e6f9a9b804 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -629,6 +629,80 @@ xfs_da3_node_hdr_to_disk(
 /*
  * Directory free space block operations
  */
+static int
+xfs_dir2_free_hdr_size(void)
+{
+	return sizeof(struct xfs_dir2_free_hdr);
+}
+
+static int
+xfs_dir2_free_max_bests(struct xfs_mount *mp)
+{
+	return (mp->m_dirblksize - xfs_dir2_free_hdr_size()) /
+		sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + xfs_dir2_free_hdr_size());
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp);
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+	return db % xfs_dir2_free_max_bests(mp);
+}
+
+static int
+xfs_dir3_free_hdr_size(void)
+{
+	return sizeof(struct xfs_dir3_free_hdr);
+}
+
+static int
+xfs_dir3_free_max_bests(struct xfs_mount *mp)
+{
+	return (mp->m_dirblksize - xfs_dir3_free_hdr_size()) /
+		sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + xfs_dir3_free_hdr_size());
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+	return db % xfs_dir3_free_max_bests(mp);
+}
+
 static void
 xfs_dir2_free_hdr_from_disk(
 	struct xfs_dir3_icfree_hdr	*to,
@@ -722,8 +796,13 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
 
+	.free_hdr_size = xfs_dir2_free_hdr_size,
 	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
 	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+	.free_max_bests = xfs_dir2_free_max_bests,
+	.free_bests_p = xfs_dir2_free_bests_p,
+	.db_to_fdb = xfs_dir2_db_to_fdb,
+	.db_to_fdindex = xfs_dir2_db_to_fdindex,
 };
 
 const struct xfs_dir_ops xfs_dir2_ftype_ops = {
@@ -764,8 +843,13 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
 
+	.free_hdr_size = xfs_dir2_free_hdr_size,
 	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
 	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+	.free_max_bests = xfs_dir2_free_max_bests,
+	.free_bests_p = xfs_dir2_free_bests_p,
+	.db_to_fdb = xfs_dir2_db_to_fdb,
+	.db_to_fdindex = xfs_dir2_db_to_fdindex,
 };
 
 const struct xfs_dir_ops xfs_dir3_ops = {
@@ -806,8 +890,13 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
 	.node_tree_p = xfs_da3_node_tree_p,
 
+	.free_hdr_size = xfs_dir3_free_hdr_size,
 	.free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
 	.free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+	.free_max_bests = xfs_dir3_free_max_bests,
+	.free_bests_p = xfs_dir3_free_bests_p,
+	.db_to_fdb = xfs_dir3_db_to_fdb,
+	.db_to_fdindex = xfs_dir3_db_to_fdindex,
 };
 
 const struct xfs_dir_ops xfs_dir2_nondir_ops = {
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index 1ff7b9f42097..a19d3f8f639c 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -704,45 +704,6 @@ struct xfs_dir3_icfree_hdr {
 
 };
 
-static inline int
-xfs_dir3_free_hdr_size(struct xfs_mount *mp)
-{
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		return sizeof(struct xfs_dir3_free_hdr);
-	return sizeof(struct xfs_dir2_free_hdr);
-}
-
-static inline int
-xfs_dir3_free_max_bests(struct xfs_mount *mp)
-{
-	return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) /
-		sizeof(xfs_dir2_data_off_t);
-}
-
-static inline __be16 *
-xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free)
-{
-	return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static inline xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
-	return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static inline int
-xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
-	return db % xfs_dir3_free_max_bests(mp);
-}
-
 /*
  * Single block format.
  *
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 61195348f667..4264cb9199f8 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -92,10 +92,15 @@ struct xfs_dir_ops {
 	struct xfs_da_node_entry *
 		(*node_tree_p)(struct xfs_da_intnode *dap);
 
+	int	(*free_hdr_size)(void);
 	void	(*free_hdr_to_disk)(struct xfs_dir2_free *to,
 				    struct xfs_dir3_icfree_hdr *from);
 	void	(*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
 				      struct xfs_dir2_free *from);
+	int	(*free_max_bests)(struct xfs_mount *mp);
+	__be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+	xfs_dir2_db_t (*db_to_fdb)(struct xfs_mount *mp, xfs_dir2_db_t db);
+	int	(*db_to_fdindex)(struct xfs_mount *mp, xfs_dir2_db_t db);
 };
 
 extern const struct xfs_dir_ops *
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 2fb8db9fb574..a3c21b15cf7b 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1785,7 +1785,7 @@ xfs_dir2_node_to_leaf(
 	/*
 	 * Set up the leaf bests table.
 	 */
-	memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free),
+	memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
 		freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
 
 	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index eaee8c36bcc4..853798519ae9 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -240,6 +240,7 @@ xfs_dir3_free_get_buf(
 STATIC void
 xfs_dir2_free_log_bests(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp,
 	int			first,		/* first entry to log */
 	int			last)		/* last entry to log */
@@ -248,7 +249,7 @@ xfs_dir2_free_log_bests(
 	__be16			*bests;
 
 	free = bp->b_addr;
-	bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+	bests = dp->d_ops->free_bests_p(free);
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
 	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
 	xfs_trans_log_buf(tp, bp,
@@ -263,6 +264,7 @@ xfs_dir2_free_log_bests(
 static void
 xfs_dir2_free_log_header(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
 	struct xfs_buf		*bp)
 {
 #ifdef DEBUG
@@ -272,7 +274,7 @@ xfs_dir2_free_log_header(
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
 	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
 #endif
-	xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
+	xfs_trans_log_buf(tp, bp, 0, dp->d_ops->free_hdr_size() - 1);
 }
 
 /*
@@ -332,7 +334,7 @@ xfs_dir2_leaf_to_node(
 	 * Count active entries.
 	 */
 	from = xfs_dir2_leaf_bests_p(ltp);
-	to = xfs_dir3_free_bests_p(mp, free);
+	to = dp->d_ops->free_bests_p(free);
 	for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
 		if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
 			n++;
@@ -346,8 +348,8 @@ xfs_dir2_leaf_to_node(
 	freehdr.nvalid = be32_to_cpu(ltp->bestcount);
 
 	dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-	xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1);
-	xfs_dir2_free_log_header(tp, fbp);
+	xfs_dir2_free_log_bests(tp, dp, fbp, 0, freehdr.nvalid - 1);
+	xfs_dir2_free_log_header(tp, dp, fbp);
 
 	/*
 	 * Converting the leaf to a leafnode is just a matter of changing the
@@ -468,7 +470,7 @@ xfs_dir2_free_hdr_check(
 
 	dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
 
-	ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(dp->i_mount)) == 0);
+	ASSERT((hdr.firstdb % dp->d_ops->free_max_bests(dp->i_mount)) == 0);
 	ASSERT(hdr.firstdb <= db);
 	ASSERT(db < hdr.firstdb + hdr.nvalid);
 }
@@ -590,7 +592,7 @@ xfs_dir2_leafn_lookup_for_addname(
 			 * Convert the data block to the free block
 			 * holding its freespace information.
 			 */
-			newfdb = xfs_dir2_db_to_fdb(mp, newdb);
+			newfdb = dp->d_ops->db_to_fdb(mp, newdb);
 			/*
 			 * If it's not the one we have in hand, read it in.
 			 */
@@ -613,11 +615,11 @@ xfs_dir2_leafn_lookup_for_addname(
 			/*
 			 * Get the index for our entry.
 			 */
-			fi = xfs_dir2_db_to_fdindex(mp, curdb);
+			fi = dp->d_ops->db_to_fdindex(mp, curdb);
 			/*
 			 * If it has room, return it.
 			 */
-			bests = xfs_dir3_free_bests_p(mp, free);
+			bests = dp->d_ops->free_bests_p(free);
 			if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
 				XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
 							XFS_ERRLEVEL_LOW, mp);
@@ -1080,15 +1082,14 @@ xfs_dir3_data_block_free(
 	struct xfs_inode	*dp = args->dp;
 
 	dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
-	bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+	bests = dp->d_ops->free_bests_p(free);
 	if (hdr) {
 		/*
 		 * Data block is not empty, just set the free entry to the new
 		 * value.
 		 */
 		bests[findex] = cpu_to_be16(longest);
-		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex);
 		return 0;
 	}
 
@@ -1116,7 +1117,7 @@ xfs_dir3_data_block_free(
 	}
 
 	dp->d_ops->free_hdr_to_disk(free, &freehdr);
-	xfs_dir2_free_log_header(tp, fbp);
+	xfs_dir2_free_log_header(tp, dp, fbp);
 
 	/*
 	 * If there are no useful entries left in the block, get rid of the
@@ -1140,7 +1141,7 @@ xfs_dir3_data_block_free(
 
 	/* Log the free entry that changed, unless we got rid of it.  */
 	if (logfree)
-		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex);
 	return 0;
 }
 
@@ -1243,7 +1244,7 @@ xfs_dir2_leafn_remove(
 		 * Convert the data block number to a free block,
 		 * read in the free block.
 		 */
-		fdb = xfs_dir2_db_to_fdb(mp, db);
+		fdb = dp->d_ops->db_to_fdb(mp, db);
 		error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
 					   &fbp);
 		if (error)
@@ -1253,14 +1254,14 @@ xfs_dir2_leafn_remove(
 	{
 		struct xfs_dir3_icfree_hdr freehdr;
 		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) *
+		ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(mp) *
 					  (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
 	}
 #endif
 		/*
 		 * Calculate which entry we need to fix.
 		 */
-		findex = xfs_dir2_db_to_fdindex(mp, db);
+		findex = dp->d_ops->db_to_fdindex(mp, db);
 		longest = be16_to_cpu(bf[0].length);
 		/*
 		 * If the data block is now empty we can get rid of it
@@ -1688,7 +1689,7 @@ xfs_dir2_node_addname_int(
 		ifbno = fblk->blkno;
 		free = fbp->b_addr;
 		findex = fblk->index;
-		bests = xfs_dir3_free_bests_p(mp, free);
+		bests = dp->d_ops->free_bests_p(free);
 		dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 		/*
@@ -1781,7 +1782,7 @@ xfs_dir2_node_addname_int(
 		 * and the freehdr are actually initialised if they are placed
 		 * there, so we have to do it here to avoid warnings. Blech.
 		 */
-		bests = xfs_dir3_free_bests_p(mp, free);
+		bests = dp->d_ops->free_bests_p(free);
 		dp->d_ops->free_hdr_from_disk(&freehdr, free);
 		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
 		    be16_to_cpu(bests[findex]) >= length)
@@ -1833,7 +1834,7 @@ xfs_dir2_node_addname_int(
 		 * Get the freespace block corresponding to the data block
 		 * that was just allocated.
 		 */
-		fbno = xfs_dir2_db_to_fdb(mp, dbno);
+		fbno = dp->d_ops->db_to_fdb(mp, dbno);
 		error = xfs_dir2_free_try_read(tp, dp,
 					       xfs_dir2_db_to_da(mp, fbno),
 					       &fbp);
@@ -1850,12 +1851,12 @@ xfs_dir2_node_addname_int(
 			if (error)
 				return error;
 
-			if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
+			if (unlikely(dp->d_ops->db_to_fdb(mp, dbno) != fbno)) {
 				xfs_alert(mp,
 			"%s: dir ino %llu needed freesp block %lld for\n"
 			"  data block %lld, got %lld ifbno %llu lastfbno %d",
 					__func__, (unsigned long long)dp->i_ino,
-					(long long)xfs_dir2_db_to_fdb(mp, dbno),
+					(long long)dp->d_ops->db_to_fdb(mp, dbno),
 					(long long)dbno, (long long)fbno,
 					(unsigned long long)ifbno, lastfbno);
 				if (fblk) {
@@ -1880,30 +1881,30 @@ xfs_dir2_node_addname_int(
 			if (error)
 				return error;
 			free = fbp->b_addr;
-			bests = xfs_dir3_free_bests_p(mp, free);
+			bests = dp->d_ops->free_bests_p(free);
 			dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 			/*
 			 * Remember the first slot as our empty slot.
 			 */
 			freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
-					xfs_dir3_free_max_bests(mp);
+					dp->d_ops->free_max_bests(mp);
 		} else {
 			free = fbp->b_addr;
-			bests = xfs_dir3_free_bests_p(mp, free);
+			bests = dp->d_ops->free_bests_p(free);
 			dp->d_ops->free_hdr_from_disk(&freehdr, free);
 		}
 
 		/*
 		 * Set the freespace block index from the data block number.
 		 */
-		findex = xfs_dir2_db_to_fdindex(mp, dbno);
+		findex = dp->d_ops->db_to_fdindex(mp, dbno);
 		/*
 		 * If it's after the end of the current entries in the
 		 * freespace block, extend that table.
 		 */
 		if (findex >= freehdr.nvalid) {
-			ASSERT(findex < xfs_dir3_free_max_bests(mp));
+			ASSERT(findex < dp->d_ops->free_max_bests(mp));
 			freehdr.nvalid = findex + 1;
 			/*
 			 * Tag new entry so nused will go up.
@@ -1917,7 +1918,7 @@ xfs_dir2_node_addname_int(
 		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
 			freehdr.nused++;
 			dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
-			xfs_dir2_free_log_header(tp, fbp);
+			xfs_dir2_free_log_header(tp, dp, fbp);
 		}
 		/*
 		 * Update the real value in the table.
@@ -1987,7 +1988,7 @@ xfs_dir2_node_addname_int(
 	/*
 	 * If the freespace entry is now wrong, update it.
 	 */
-	bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */
+	bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
 	if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
 		bests[findex] = bf[0].length;
 		logfree = 1;
@@ -1996,7 +1997,7 @@ xfs_dir2_node_addname_int(
 	 * Log the freespace entry if needed.
 	 */
 	if (logfree)
-		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex);
 	/*
 	 * Return the data block and offset in args, then drop the data block.
 	 */
-- 
cgit v1.3


From 1c9a5b2e3045b91a10335adcc73267c304cb5dc0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 30 Oct 2013 09:15:02 +1100
Subject: xfs: convert directory vector functions to constants

Many of the vectorised function calls now take no parameters and
return a constant value. There is no reason for these to be vectored
functions, so convert them to constants

Binary sizes:

   text    data     bss     dec     hex filename
 794490   96802    1096  892388   d9de4 fs/xfs/xfs.o.orig
 792986   96802    1096  890884   d9804 fs/xfs/xfs.o.p1
 792350   96802    1096  890248   d9588 fs/xfs/xfs.o.p2
 789293   96802    1096  887191   d8997 fs/xfs/xfs.o.p3
 789005   96802    1096  886903   d8997 fs/xfs/xfs.o.p4
 789061   96802    1096  886959   d88af fs/xfs/xfs.o.p5
 789733   96802    1096  887631   d8b4f fs/xfs/xfs.o.p6
 791421   96802    1096  889319   d91e7 fs/xfs/xfs.o.p7
 791701   96802    1096  889599   d92ff fs/xfs/xfs.o.p8
 791205   96802    1096  889103   d91cf fs/xfs/xfs.o.p9

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_btree.c     |  14 ++-
 fs/xfs/xfs_da_format.c    | 220 +++++++++++++++++-----------------------------
 fs/xfs/xfs_dir2.c         |   2 +-
 fs/xfs/xfs_dir2.h         |  14 +--
 fs/xfs/xfs_dir2_block.c   |   4 +-
 fs/xfs/xfs_dir2_data.c    |   8 +-
 fs/xfs/xfs_dir2_leaf.c    |   6 +-
 fs/xfs/xfs_dir2_node.c    |   8 +-
 fs/xfs/xfs_dir2_readdir.c |   6 +-
 fs/xfs/xfs_dir2_sf.c      |   6 +-
 10 files changed, 112 insertions(+), 176 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index a51762dae543..796272a2e129 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -333,8 +333,7 @@ xfs_da3_node_create(
 
 	dp->d_ops->node_hdr_to_disk(node, &ichdr);
 	xfs_trans_log_buf(tp, bp,
-		XFS_DA_LOGRANGE(node, &node->hdr,
-				dp->d_ops->node_hdr_size()));
+		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 
 	*bpp = bp;
 	return(0);
@@ -838,13 +837,12 @@ xfs_da3_node_rebalance(
 	 */
 	dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
 	xfs_trans_log_buf(tp, blk1->bp,
-		XFS_DA_LOGRANGE(node1, &node1->hdr,
-				dp->d_ops->node_hdr_size()));
+		XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
 
 	dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
 	xfs_trans_log_buf(tp, blk2->bp,
 		XFS_DA_LOGRANGE(node2, &node2->hdr,
-				dp->d_ops->node_hdr_size() +
+				dp->d_ops->node_hdr_size +
 				(sizeof(btree2[0]) * nodehdr2.count)));
 
 	/*
@@ -915,7 +913,7 @@ xfs_da3_node_add(
 	nodehdr.count += 1;
 	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, oldblk->bp,
-		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size()));
+		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 
 	/*
 	 * Copy the last hash value from the oldblk to propagate upwards.
@@ -1350,7 +1348,7 @@ xfs_da3_node_remove(
 	nodehdr.count -= 1;
 	dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-	    XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size()));
+	    XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 
 	/*
 	 * Copy the last hash value from the block to propagate upwards.
@@ -1423,7 +1421,7 @@ xfs_da3_node_unbalance(
 	dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
 	xfs_trans_log_buf(tp, save_blk->bp,
 		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
-				dp->d_ops->node_hdr_size()));
+				dp->d_ops->node_hdr_size));
 
 	/*
 	 * Save the last hashval in the remaining block for upward propagation.
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index 01e6f9a9b804..40f18d3b7b9b 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -209,31 +209,41 @@ xfs_dir3_sfe_put_ino(
 /*
  * Directory data block operations
  */
-static int
-__xfs_dir3_data_entsize(
-	bool	ftype,
-	int	n)
-{
-	int	size = offsetof(struct xfs_dir2_data_entry, name[0]);
 
-	size += n;
-	size += sizeof(xfs_dir2_data_off_t);
-	if (ftype)
-		size += sizeof(__uint8_t);
-	return roundup(size, XFS_DIR2_DATA_ALIGN);
-}
+/*
+ * For special situations, the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and "..", and
+ * therefore we know that they are a fixed size and hence their offsets are
+ * constant, as is the first entry.
+ *
+ * Hence, this calculation is written as a macro to be able to be calculated at
+ * compile time and so certain offsets can be calculated directly in the
+ * structure initaliser via the macro. There are two macros - one for dirents
+ * with ftype and without so there are no unresolvable conditionals in the
+ * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
+ * of 2 and the compiler doesn't reject it (unlike roundup()).
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n)					\
+	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
+		 sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+
+#define XFS_DIR3_DATA_ENTSIZE(n)					\
+	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) +	\
+		 sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)),	\
+		XFS_DIR2_DATA_ALIGN)
 
 static int
 xfs_dir2_data_entsize(
 	int			n)
 {
-	return __xfs_dir3_data_entsize(false, n);
+	return XFS_DIR2_DATA_ENTSIZE(n);
 }
+
 static int
 xfs_dir3_data_entsize(
 	int			n)
 {
-	return __xfs_dir3_data_entsize(true, n);
+	return XFS_DIR3_DATA_ENTSIZE(n);
 }
 
 static __uint8_t
@@ -293,45 +303,6 @@ xfs_dir3_data_entry_tag_p(
 		xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
 }
 
-/*
- * Offsets of . and .. in data space (always block 0)
- */
-static xfs_dir2_data_aoff_t
-xfs_dir2_data_dot_offset(void)
-{
-	return sizeof(struct xfs_dir2_data_hdr);
-}
-
-static xfs_dir2_data_aoff_t
-xfs_dir2_data_dotdot_offset(void)
-{
-	return xfs_dir2_data_dot_offset() + xfs_dir2_data_entsize(1);
-}
-
-static xfs_dir2_data_aoff_t
-xfs_dir2_data_first_offset(void)
-{
-	return xfs_dir2_data_dotdot_offset() + xfs_dir2_data_entsize(2);
-}
-
-static xfs_dir2_data_aoff_t
-xfs_dir3_data_dot_offset(void)
-{
-	return sizeof(struct xfs_dir3_data_hdr);
-}
-
-static xfs_dir2_data_aoff_t
-xfs_dir3_data_dotdot_offset(void)
-{
-	return xfs_dir3_data_dot_offset() + xfs_dir3_data_entsize(1);
-}
-
-static xfs_dir2_data_aoff_t
-xfs_dir3_data_first_offset(void)
-{
-	return xfs_dir3_data_dotdot_offset() + xfs_dir3_data_entsize(2);
-}
-
 /*
  * location of . and .. in data space (always block 0)
  */
@@ -340,7 +311,7 @@ xfs_dir2_data_dot_entry_p(
 	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir2_data_dot_offset());
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
 }
 
 static struct xfs_dir2_data_entry *
@@ -348,7 +319,8 @@ xfs_dir2_data_dotdot_entry_p(
 	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir2_data_dotdot_offset());
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1));
 }
 
 static struct xfs_dir2_data_entry *
@@ -356,7 +328,9 @@ xfs_dir2_data_first_entry_p(
 	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir2_data_first_offset());
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1) +
+				XFS_DIR2_DATA_ENTSIZE(2));
 }
 
 static struct xfs_dir2_data_entry *
@@ -364,7 +338,7 @@ xfs_dir3_data_dot_entry_p(
 	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dot_offset());
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
 }
 
 static struct xfs_dir2_data_entry *
@@ -372,7 +346,8 @@ xfs_dir3_data_dotdot_entry_p(
 	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dotdot_offset());
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1));
 }
 
 static struct xfs_dir2_data_entry *
@@ -380,7 +355,9 @@ xfs_dir3_data_first_entry_p(
 	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_first_offset());
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2));
 }
 
 static struct xfs_dir2_data_free *
@@ -395,60 +372,42 @@ xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
 	return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
 }
 
-static size_t
-xfs_dir2_data_entry_offset(void)
-{
-	return sizeof(struct xfs_dir2_data_hdr);
-}
-
 static struct xfs_dir2_data_entry *
 xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir2_data_entry_offset());
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
 }
 
 static struct xfs_dir2_data_unused *
 xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_unused *)
-		((char *)hdr + xfs_dir2_data_entry_offset());
-}
-
-static size_t
-xfs_dir3_data_entry_offset(void)
-{
-	return sizeof(struct xfs_dir3_data_hdr);
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
 }
 
 static struct xfs_dir2_data_entry *
 xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_entry_offset());
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
 }
 
 static struct xfs_dir2_data_unused *
 xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_unused *)
-		((char *)hdr + xfs_dir3_data_entry_offset());
+		((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
 }
 
 
 /*
  * Directory Leaf block operations
  */
-static int
-xfs_dir2_leaf_hdr_size(void)
-{
-	return sizeof(struct xfs_dir2_leaf_hdr);
-}
-
 static int
 xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
 {
-	return (mp->m_dirblksize - xfs_dir2_leaf_hdr_size()) /
+	return (mp->m_dirblksize - sizeof(struct xfs_dir2_leaf_hdr)) /
 		(uint)sizeof(struct xfs_dir2_leaf_entry);
 }
 
@@ -458,16 +417,10 @@ xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
 	return lp->__ents;
 }
 
-static int
-xfs_dir3_leaf_hdr_size(void)
-{
-	return sizeof(struct xfs_dir3_leaf_hdr);
-}
-
 static int
 xfs_dir3_max_leaf_ents(struct xfs_mount *mp)
 {
-	return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size()) /
+	return (mp->m_dirblksize - sizeof(struct xfs_dir3_leaf_hdr)) /
 		(uint)sizeof(struct xfs_dir2_leaf_entry);
 }
 
@@ -545,25 +498,13 @@ xfs_dir3_leaf_hdr_to_disk(
 /*
  * Directory/Attribute Node block operations
  */
-static inline int
-xfs_da2_node_hdr_size(void)
-{
-	return sizeof(struct xfs_da_node_hdr);
-}
-
 static struct xfs_da_node_entry *
 xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
 {
 	return dap->__btree;
 }
 
-static inline int
-xfs_da3_node_hdr_size(void)
-{
-	return sizeof(struct xfs_da3_node_hdr);
-}
-
-static inline struct xfs_da_node_entry *
+static struct xfs_da_node_entry *
 xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
 {
 	return ((struct xfs_da3_intnode *)dap)->__btree;
@@ -629,23 +570,17 @@ xfs_da3_node_hdr_to_disk(
 /*
  * Directory free space block operations
  */
-static int
-xfs_dir2_free_hdr_size(void)
-{
-	return sizeof(struct xfs_dir2_free_hdr);
-}
-
 static int
 xfs_dir2_free_max_bests(struct xfs_mount *mp)
 {
-	return (mp->m_dirblksize - xfs_dir2_free_hdr_size()) /
+	return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) /
 		sizeof(xfs_dir2_data_off_t);
 }
 
 static __be16 *
 xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
 {
-	return (__be16 *)((char *)free + xfs_dir2_free_hdr_size());
+	return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
 }
 
 /*
@@ -666,23 +601,17 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
 	return db % xfs_dir2_free_max_bests(mp);
 }
 
-static int
-xfs_dir3_free_hdr_size(void)
-{
-	return sizeof(struct xfs_dir3_free_hdr);
-}
-
 static int
 xfs_dir3_free_max_bests(struct xfs_mount *mp)
 {
-	return (mp->m_dirblksize - xfs_dir3_free_hdr_size()) /
+	return (mp->m_dirblksize - sizeof(struct xfs_dir3_free_hdr)) /
 		sizeof(xfs_dir2_data_off_t);
 }
 
 static __be16 *
 xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
 {
-	return (__be16 *)((char *)free + xfs_dir3_free_hdr_size());
+	return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
 }
 
 /*
@@ -774,10 +703,13 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.data_entry_tag_p = xfs_dir2_data_entry_tag_p,
 	.data_bestfree_p = xfs_dir2_data_bestfree_p,
 
-	.data_dot_offset = xfs_dir2_data_dot_offset,
-	.data_dotdot_offset = xfs_dir2_data_dotdot_offset,
-	.data_first_offset = xfs_dir2_data_first_offset,
-	.data_entry_offset = xfs_dir2_data_entry_offset,
+	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR2_DATA_ENTSIZE(1) +
+				XFS_DIR2_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
 
 	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
 	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
@@ -785,18 +717,18 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.data_entry_p = xfs_dir2_data_entry_p,
 	.data_unused_p = xfs_dir2_data_unused_p,
 
-	.leaf_hdr_size = xfs_dir2_leaf_hdr_size,
+	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
 	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
 	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
 	.leaf_max_ents = xfs_dir2_max_leaf_ents,
 	.leaf_ents_p = xfs_dir2_leaf_ents_p,
 
-	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
 	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
 	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
 
-	.free_hdr_size = xfs_dir2_free_hdr_size,
+	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
 	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
 	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
 	.free_max_bests = xfs_dir2_free_max_bests,
@@ -821,10 +753,13 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
 	.data_bestfree_p = xfs_dir2_data_bestfree_p,
 
-	.data_dot_offset = xfs_dir2_data_dot_offset,
-	.data_dotdot_offset = xfs_dir2_data_dotdot_offset,
-	.data_first_offset = xfs_dir2_data_first_offset,
-	.data_entry_offset = xfs_dir2_data_entry_offset,
+	.data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
 
 	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
 	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
@@ -832,18 +767,18 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.data_entry_p = xfs_dir2_data_entry_p,
 	.data_unused_p = xfs_dir2_data_unused_p,
 
-	.leaf_hdr_size = xfs_dir2_leaf_hdr_size,
+	.leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
 	.leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
 	.leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
 	.leaf_max_ents = xfs_dir2_max_leaf_ents,
 	.leaf_ents_p = xfs_dir2_leaf_ents_p,
 
-	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
 	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
 	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
 
-	.free_hdr_size = xfs_dir2_free_hdr_size,
+	.free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
 	.free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
 	.free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
 	.free_max_bests = xfs_dir2_free_max_bests,
@@ -868,10 +803,13 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.data_entry_tag_p = xfs_dir3_data_entry_tag_p,
 	.data_bestfree_p = xfs_dir3_data_bestfree_p,
 
-	.data_dot_offset = xfs_dir3_data_dot_offset,
-	.data_dotdot_offset = xfs_dir3_data_dotdot_offset,
-	.data_first_offset = xfs_dir3_data_first_offset,
-	.data_entry_offset = xfs_dir3_data_entry_offset,
+	.data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
+	.data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1),
+	.data_first_offset =  sizeof(struct xfs_dir3_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2),
+	.data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
 
 	.data_dot_entry_p = xfs_dir3_data_dot_entry_p,
 	.data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
@@ -879,18 +817,18 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.data_entry_p = xfs_dir3_data_entry_p,
 	.data_unused_p = xfs_dir3_data_unused_p,
 
-	.leaf_hdr_size = xfs_dir3_leaf_hdr_size,
+	.leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
 	.leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
 	.leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
 	.leaf_max_ents = xfs_dir3_max_leaf_ents,
 	.leaf_ents_p = xfs_dir3_leaf_ents_p,
 
-	.node_hdr_size = xfs_da3_node_hdr_size,
+	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
 	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
 	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
 	.node_tree_p = xfs_da3_node_tree_p,
 
-	.free_hdr_size = xfs_dir3_free_hdr_size,
+	.free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
 	.free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
 	.free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
 	.free_max_bests = xfs_dir3_free_max_bests,
@@ -900,14 +838,14 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 };
 
 const struct xfs_dir_ops xfs_dir2_nondir_ops = {
-	.node_hdr_size = xfs_da2_node_hdr_size,
+	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
 	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
 	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
 };
 
 const struct xfs_dir_ops xfs_dir3_nondir_ops = {
-	.node_hdr_size = xfs_da3_node_hdr_size,
+	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
 	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
 	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
 	.node_tree_p = xfs_da3_node_tree_p,
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 1b44e83924b7..ce16ef02997a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -105,7 +105,7 @@ xfs_dir_mount(
 	mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
 	mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
 
-	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size();
+	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
 	mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
 				(uint)sizeof(xfs_da_node_entry_t);
 	mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 4264cb9199f8..cec70e0781ab 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -59,10 +59,10 @@ struct xfs_dir_ops {
 	struct xfs_dir2_data_free *
 		(*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
 
-	xfs_dir2_data_aoff_t (*data_dot_offset)(void);
-	xfs_dir2_data_aoff_t (*data_dotdot_offset)(void);
-	xfs_dir2_data_aoff_t (*data_first_offset)(void);
-	size_t	(*data_entry_offset)(void);
+	xfs_dir2_data_aoff_t data_dot_offset;
+	xfs_dir2_data_aoff_t data_dotdot_offset;
+	xfs_dir2_data_aoff_t data_first_offset;
+	size_t	data_entry_offset;
 
 	struct xfs_dir2_data_entry *
 		(*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
@@ -75,7 +75,7 @@ struct xfs_dir_ops {
 	struct xfs_dir2_data_unused *
 		(*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
 
-	int	(*leaf_hdr_size)(void);
+	int	leaf_hdr_size;
 	void	(*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
 				    struct xfs_dir3_icleaf_hdr *from);
 	void	(*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
@@ -84,7 +84,7 @@ struct xfs_dir_ops {
 	struct xfs_dir2_leaf_entry *
 		(*leaf_ents_p)(struct xfs_dir2_leaf *lp);
 
-	int	(*node_hdr_size)(void);
+	int	node_hdr_size;
 	void	(*node_hdr_to_disk)(struct xfs_da_intnode *to,
 				    struct xfs_da3_icnode_hdr *from);
 	void	(*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
@@ -92,7 +92,7 @@ struct xfs_dir_ops {
 	struct xfs_da_node_entry *
 		(*node_tree_p)(struct xfs_da_intnode *dap);
 
-	int	(*free_hdr_size)(void);
+	int	free_hdr_size;
 	void	(*free_hdr_to_disk)(struct xfs_dir2_free *to,
 				    struct xfs_dir3_icfree_hdr *from);
 	void	(*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index c1ff552aa7bf..90cdbf4b5f19 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -951,7 +951,7 @@ xfs_dir2_leaf_to_block(
 	while (dp->i_d.di_size > mp->m_dirblksize) {
 		int hdrsz;
 
-		hdrsz = dp->d_ops->data_entry_offset();
+		hdrsz = dp->d_ops->data_entry_offset;
 		bestsp = xfs_dir2_leaf_bests_p(ltp);
 		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
 					    mp->m_dirblksize - hdrsz) {
@@ -1185,7 +1185,7 @@ xfs_dir2_sf_to_block(
 	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
 	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
 				(char *)dep - (char *)hdr));
-	offset = dp->d_ops->data_first_offset();
+	offset = dp->d_ops->data_first_offset;
 	/*
 	 * Loop over existing entries, stuff them in.
 	 */
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 32d657fbe6a4..18e920c86be6 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -598,7 +598,7 @@ xfs_dir3_data_init(
 		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
 
 	bf = dp->d_ops->data_bestfree_p(hdr);
-	bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset());
+	bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
 	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
 		bf[i].length = 0;
 		bf[i].offset = 0;
@@ -610,7 +610,7 @@ xfs_dir3_data_init(
 	dup = dp->d_ops->data_unused_p(hdr);
 	dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
 
-	t = mp->m_dirblksize - (uint)dp->d_ops->data_entry_offset();
+	t = mp->m_dirblksize - (uint)dp->d_ops->data_entry_offset;
 	bf[0].length = cpu_to_be16(t);
 	dup->length = cpu_to_be16(t);
 	*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
@@ -663,7 +663,7 @@ xfs_dir2_data_log_header(
 	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 #endif
 
-	xfs_trans_log_buf(tp, bp, 0, dp->d_ops->data_entry_offset() - 1);
+	xfs_trans_log_buf(tp, bp, 0, dp->d_ops->data_entry_offset - 1);
 }
 
 /*
@@ -742,7 +742,7 @@ xfs_dir2_data_make_free(
 	 * If this isn't the start of the block, then back up to
 	 * the previous entry and see if it's free.
 	 */
-	if (offset > dp->d_ops->data_entry_offset()) {
+	if (offset > dp->d_ops->data_entry_offset) {
 		__be16			*tagp;	/* tag just before us */
 
 		tagp = (__be16 *)((char *)hdr + offset) - 1;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index a3c21b15cf7b..ae47ec6e16c4 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1116,7 +1116,7 @@ xfs_dir3_leaf_log_header(
 	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
 
 	xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
-			  dp->d_ops->leaf_hdr_size() - 1);
+			  dp->d_ops->leaf_hdr_size - 1);
 }
 
 /*
@@ -1424,7 +1424,7 @@ xfs_dir2_leaf_removename(
 	 * If the data block is now empty then get rid of the data block.
 	 */
 	if (be16_to_cpu(bf[0].length) ==
-			mp->m_dirblksize - dp->d_ops->data_entry_offset()) {
+			mp->m_dirblksize - dp->d_ops->data_entry_offset) {
 		ASSERT(db != mp->m_dirdatablk);
 		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
 			/*
@@ -1623,7 +1623,7 @@ xfs_dir2_leaf_trim_data(
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
 	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
 	ASSERT(be16_to_cpu(bf[0].length) ==
-	       mp->m_dirblksize - dp->d_ops->data_entry_offset());
+	       mp->m_dirblksize - dp->d_ops->data_entry_offset);
 	ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
 }
 #endif
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 853798519ae9..f90bf0690346 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -274,7 +274,7 @@ xfs_dir2_free_log_header(
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
 	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
 #endif
-	xfs_trans_log_buf(tp, bp, 0, dp->d_ops->free_hdr_size() - 1);
+	xfs_trans_log_buf(tp, bp, 0, dp->d_ops->free_hdr_size - 1);
 }
 
 /*
@@ -1268,7 +1268,7 @@ xfs_dir2_leafn_remove(
 		 * (usually).
 		 */
 		if (longest == mp->m_dirblksize -
-			       dp->d_ops->data_entry_offset()) {
+			       dp->d_ops->data_entry_offset) {
 			/*
 			 * Try to punch out the data block.
 			 */
@@ -1300,7 +1300,7 @@ xfs_dir2_leafn_remove(
 	 * Return indication of whether this leaf block is empty enough
 	 * to justify trying to join it with a neighbor.
 	 */
-	*rval = (dp->d_ops->leaf_hdr_size() +
+	*rval = (dp->d_ops->leaf_hdr_size +
 		 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
 		mp->m_dir_magicpct;
 	return 0;
@@ -1409,7 +1409,7 @@ xfs_dir2_leafn_toosmall(
 	xfs_dir3_leaf_check(dp, blk->bp);
 
 	count = leafhdr.count - leafhdr.stale;
-	bytes = dp->d_ops->leaf_hdr_size() + count * sizeof(ents[0]);
+	bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
 	if (bytes > (state->blocksize >> 1)) {
 		/*
 		 * Blk over 50%, don't try to join.
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f45ecac821ec..c4e50c6ed584 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -119,9 +119,9 @@ xfs_dir2_sf_getdents(
 	 * mp->m_dirdatablk.
 	 */
 	dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						dp->d_ops->data_dot_offset());
+						dp->d_ops->data_dot_offset);
 	dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						dp->d_ops->data_dotdot_offset());
+						dp->d_ops->data_dotdot_offset);
 
 	/*
 	 * Put . entry unless we're starting past it.
@@ -584,7 +584,7 @@ xfs_dir2_leaf_getdents(
 			 * Skip past the header.
 			 */
 			if (byteoff == 0)
-				curoff += dp->d_ops->data_entry_offset();
+				curoff += dp->d_ops->data_entry_offset;
 			/*
 			 * Skip past entries until we reach our offset.
 			 */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 42bbcff15cc5..aafc6e46cb58 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -473,7 +473,7 @@ xfs_dir2_sf_addname_hard(
 	 * to insert the new entry.
 	 * If it's going to end up at the end then oldsfep will point there.
 	 */
-	for (offset = dp->d_ops->data_first_offset(),
+	for (offset = dp->d_ops->data_first_offset,
 	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
 	      add_datasize = dp->d_ops->data_entsize(args->namelen),
 	      eof = (char *)oldsfep == &buf[old_isize];
@@ -556,7 +556,7 @@ xfs_dir2_sf_addname_pick(
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	size = dp->d_ops->data_entsize(args->namelen);
-	offset = dp->d_ops->data_first_offset();
+	offset = dp->d_ops->data_first_offset;
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	holefit = 0;
 	/*
@@ -629,7 +629,7 @@ xfs_dir2_sf_check(
 	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	offset = dp->d_ops->data_first_offset();
+	offset = dp->d_ops->data_first_offset;
 	ino = dp->d_ops->sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
-- 
cgit v1.3


From b01ef655d8b9753c3d25e4c22e0a4e7c442fe5e1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:55 +1100
Subject: xfs: make dir2 ftype offset pointers explicit

Rather than hiding the ftype field size accounting inside the dirent
padding for the ".." and first entry offset functions for v2
directory formats, add explicit functions that calculate it
correctly.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_format.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index 40f18d3b7b9b..870cac0b0981 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -333,6 +333,25 @@ xfs_dir2_data_first_entry_p(
 				XFS_DIR2_DATA_ENTSIZE(2));
 }
 
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+	struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+				XFS_DIR3_DATA_ENTSIZE(1) +
+				XFS_DIR3_DATA_ENTSIZE(2));
+}
+
 static struct xfs_dir2_data_entry *
 xfs_dir3_data_dot_entry_p(
 	struct xfs_dir2_data_hdr *hdr)
@@ -762,8 +781,8 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
 
 	.data_dot_entry_p = xfs_dir2_data_dot_entry_p,
-	.data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
-	.data_first_entry_p = xfs_dir2_data_first_entry_p,
+	.data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+	.data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
 	.data_entry_p = xfs_dir2_data_entry_p,
 	.data_unused_p = xfs_dir2_data_unused_p,
 
-- 
cgit v1.3


From a62936210525da2f46ae264e0bf9680eafac176e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:56 +1100
Subject: xfs: validity check the directory block leaf entry count

The directory block format verifier fails to check that the leaf
entry count is in a valid range, and so if it is corrupted then it
can lead to derefencing a pointer outside the block buffer. While we
can't exactly validate the count without first walking the directory
block, we can ensure the count lands in the valid area within the
directory block and hence avoid out-of-block references.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_data.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 18e920c86be6..70acff4ee173 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -65,7 +65,6 @@ __xfs_dir3_data_check(
 	const struct xfs_dir_ops *ops;
 
 	mp = bp->b_target->bt_mount;
-	hdr = bp->b_addr;
 
 	/*
 	 * We can be passed a null dp here from a verifier, so we need to go the
@@ -73,12 +72,25 @@ __xfs_dir3_data_check(
 	 */
 	ops = xfs_dir_get_ops(mp, dp);
 
+	hdr = bp->b_addr;
+	p = (char *)ops->data_entry_p(hdr);
+
 	switch (hdr->magic) {
 	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
 	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
 		btp = xfs_dir2_block_tail_p(mp, hdr);
 		lep = xfs_dir2_block_leaf_p(btp);
 		endp = (char *)lep;
+
+		/*
+		 * The number of leaf entries is limited by the size of the
+		 * block and the amount of space used by the data entries.
+		 * We don't know how much space is used by the data entries yet,
+		 * so just ensure that the count falls somewhere inside the
+		 * block right now.
+		 */
+		XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+			((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
 		break;
 	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
 	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
@@ -88,13 +100,12 @@ __xfs_dir3_data_check(
 		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
 		return EFSCORRUPTED;
 	}
-	bf = ops->data_bestfree_p(hdr);
-	p = (char *)ops->data_entry_p(hdr);
 
-	count = lastfree = freeseen = 0;
 	/*
 	 * Account for zero bestfree entries.
 	 */
+	bf = ops->data_bestfree_p(hdr);
+	count = lastfree = freeseen = 0;
 	if (!bf[0].length) {
 		XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
 		freeseen |= 1 << 0;
-- 
cgit v1.3


From 632b89e82bf1c04c251924b49adc689f7b346321 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:58 +1100
Subject: xfs: fix static and extern sparse warnings

The kbuild test robot indicated that there were some new sparse
warnings in fs/xfs/xfs_dquot_buf.c. Actually, there were a lot more
that is wasn't warning about, so fix them all up.

Reported-by: kbuild test robot
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc_btree.c  |  1 +
 fs/xfs/xfs_attr_leaf.c    |  1 +
 fs/xfs/xfs_attr_remote.c  |  1 +
 fs/xfs/xfs_bit.c          |  1 +
 fs/xfs/xfs_bmap_btree.c   |  1 +
 fs/xfs/xfs_buf_item.c     |  2 +-
 fs/xfs/xfs_da_format.c    | 11 ++++++-----
 fs/xfs/xfs_dquot_buf.c    |  3 ++-
 fs/xfs/xfs_ialloc_btree.c |  1 +
 fs/xfs/xfs_rtbitmap.c     |  1 +
 fs/xfs/xfs_sb.c           |  1 +
 fs/xfs/xfs_super.c        |  2 +-
 12 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 698587f6c60a..13085429e523 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index fdf9992dcbbc..7b126f46a2f9 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 2e5530467f2d..739e0a52deda 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index 16ce44a2b43e..0e8885a59646 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_log_format.h"
+#include "xfs_bit.h"
 
 /*
  * XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 2fb4a2202e17..706bc3f777cb 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index b6d20c55282b..a64f67ba25d3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -809,7 +809,7 @@ xfs_buf_item_init(
  * Mark bytes first through last inclusive as dirty in the buf
  * item's bitmap.
  */
-void
+static void
 xfs_buf_item_log_segment(
 	struct xfs_buf_log_item	*bip,
 	uint			first,
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index 870cac0b0981..e6c83e1fbc8a 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -706,7 +707,7 @@ xfs_dir3_free_hdr_to_disk(
 	hdr3->nused = cpu_to_be32(from->nused);
 }
 
-const struct xfs_dir_ops xfs_dir2_ops = {
+static const struct xfs_dir_ops xfs_dir2_ops = {
 	.sf_entsize = xfs_dir2_sf_entsize,
 	.sf_nextentry = xfs_dir2_sf_nextentry,
 	.sf_get_ftype = xfs_dir2_sfe_get_ftype,
@@ -756,7 +757,7 @@ const struct xfs_dir_ops xfs_dir2_ops = {
 	.db_to_fdindex = xfs_dir2_db_to_fdindex,
 };
 
-const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.sf_entsize = xfs_dir3_sf_entsize,
 	.sf_nextentry = xfs_dir3_sf_nextentry,
 	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
@@ -806,7 +807,7 @@ const struct xfs_dir_ops xfs_dir2_ftype_ops = {
 	.db_to_fdindex = xfs_dir2_db_to_fdindex,
 };
 
-const struct xfs_dir_ops xfs_dir3_ops = {
+static const struct xfs_dir_ops xfs_dir3_ops = {
 	.sf_entsize = xfs_dir3_sf_entsize,
 	.sf_nextentry = xfs_dir3_sf_nextentry,
 	.sf_get_ftype = xfs_dir3_sfe_get_ftype,
@@ -856,14 +857,14 @@ const struct xfs_dir_ops xfs_dir3_ops = {
 	.db_to_fdindex = xfs_dir3_db_to_fdindex,
 };
 
-const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
 	.node_hdr_size = sizeof(struct xfs_da_node_hdr),
 	.node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
 	.node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
 	.node_tree_p = xfs_da2_node_tree_p,
 };
 
-const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
 	.node_hdr_size = sizeof(struct xfs_da3_node_hdr),
 	.node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
 	.node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index aaaf41b29869..d401457d2f25 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -18,6 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
@@ -267,7 +268,7 @@ xfs_dquot_buf_read_verify(
  * the buffer after the update is done. This ensures that the dquot in the
  * buffer always has an up-to-date CRC value.
  */
-void
+static void
 xfs_dquot_buf_write_verify(
 	struct xfs_buf	*bp)
 {
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 1fa142dc86cb..c8fa5bbb36de 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c
index e30efe8be5e6..b1f2fe8af4a8 100644
--- a/fs/xfs/xfs_rtbitmap.c
+++ b/fs/xfs/xfs_rtbitmap.c
@@ -37,6 +37,7 @@
 #include "xfs_buf.h"
 #include "xfs_icache.h"
 #include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
 
 
 /*
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 05b5493d2baa..13c7834b0e65 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4eb63ad87d7d..d971f4932b5d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1155,7 +1155,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
  * Note: xfs_log_quiesce() stops background log work - the callers must ensure
  * it is started again when appropriate.
  */
-void
+static void
 xfs_quiesce_attr(
 	struct xfs_mount	*mp)
 {
-- 
cgit v1.3


From ad22c7a043c2cc6792820e6c5da699935933e87d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:57 +1100
Subject: xfs: prevent stack overflows from page cache allocation

Page cache allocation doesn't always go through ->begin_write and
hence we don't always get the opportunity to set the allocation
context to GFP_NOFS. Failing to do this means we open up the direct
relcaim stack to recurse into the filesystem and consume a
significant amount of stack.

On RHEL6.4 kernels we are seeing ra_submit() and
generic_file_splice_read() from an nfsd context recursing into the
filesystem via the inode cache shrinker and evicting inodes. This is
causing truncation to be run (e.g EOF block freeing) and causing
bmap btree block merges and free space btree block splits to occur.
These btree manipulations are occurring with the call chain already
30 functions deep and hence there is not enough stack space to
complete such operations.

To avoid these specific overruns, we need to prevent the page cache
allocation from recursing via direct reclaim. We can do that because
the allocation functions take the allocation context from that which
is stored in the mapping for the inode. We don't set that right now,
so the default is GFP_HIGHUSER_MOVABLE, which is effectively a
GFP_KERNEL context. We need it to be the equivalent of GFP_NOFS, so
when we initialise an inode, set the mapping gfp mask appropriately.

This makes the use of AOP_FLAG_NOFS redundant from other parts of
the XFS IO path, so get rid of it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c | 3 +--
 fs/xfs/xfs_iops.c | 9 +++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index eb310caf13b1..71c8c9d2b882 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1572,8 +1572,7 @@ xfs_vm_write_begin(
 
 	ASSERT(len <= PAGE_CACHE_SIZE);
 
-	page = grab_cache_page_write_begin(mapping, index,
-					   flags | AOP_FLAG_NOFS);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c4cd6d47f523..27e0e544e963 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1168,6 +1168,7 @@ xfs_setup_inode(
 	struct xfs_inode	*ip)
 {
 	struct inode		*inode = &ip->i_vnode;
+	gfp_t			gfp_mask;
 
 	inode->i_ino = ip->i_ino;
 	inode->i_state = I_NEW;
@@ -1229,6 +1230,14 @@ xfs_setup_inode(
 		break;
 	}
 
+	/*
+	 * Ensure all page cache allocations are done from GFP_NOFS context to
+	 * prevent direct reclaim recursion back into the filesystem and blowing
+	 * stacks or deadlocking.
+	 */
+	gfp_mask = mapping_gfp_mask(inode->i_mapping);
+	mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
+
 	/*
 	 * If there is no attribute fork no ACL can exist on this inode,
 	 * and it can't have any file capabilities attached to it either.
-- 
cgit v1.3


From 5bf1f439c89d4653f8cc8f8aa303e0d6991aba4b Mon Sep 17 00:00:00 2001
From: Denis Efremov <yefremov.denis@gmail.com>
Date: Fri, 25 Oct 2013 15:53:25 +0400
Subject: xfs:xfs_dir2_node.c: pointer use before check for null

ASSERT on args takes place after args dereference.
This assertion is redundant since we are going to panic anyway.

Found by Linux Driver Verification project (linuxtesting.org) -
PVS-Studio analyzer.

Signed-off-by: Denis Efremov <yefremov.denis@gmail.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_node.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f90bf0690346..56369d4509d5 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1327,7 +1327,6 @@ xfs_dir2_leafn_split(
 	args = state->args;
 	dp = args->dp;
 	mp = dp->i_mount;
-	ASSERT(args != NULL);
 	ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
 	error = xfs_da_grow_inode(args, &blkno);
 	if (error) {
-- 
cgit v1.3


From 643f7c4e5656bd18c769211f933190f7bb738245 Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Wed, 30 Oct 2013 16:01:00 -0500
Subject: xfs: fix possible NULL dereference in xlog_verify_iclog

In xlog_verify_iclog a debug check of the incore log buffers prints an
error if icptr is null and then goes on to dereference the pointer
regardless.  Convert this to an assert so that the intention is clear.
This was reported by Coverty.

Signed-off-by: Ben Myers <bpm@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/xfs/xfs_log.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e523396753c5..49dd41e6a2dc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3710,11 +3710,9 @@ xlog_verify_iclog(
 	/* check validity of iclog pointers */
 	spin_lock(&log->l_icloglock);
 	icptr = log->l_iclog;
-	for (i=0; i < log->l_iclog_bufs; i++) {
-		if (icptr == NULL)
-			xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
-		icptr = icptr->ic_next;
-	}
+	for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
+		ASSERT(icptr);
+
 	if (icptr != log->l_iclog)
 		xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
 	spin_unlock(&log->l_icloglock);
-- 
cgit v1.3


From 10e6e65dfcedff63275c3d649d329c044caa8e26 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 9 Sep 2013 15:33:29 -0500
Subject: xfs: be more forgiving of a v4 secondary sb w/ junk in v5 fields

Today, if xfs_sb_read_verify encounters a v4 superblock
with junk past v4 fields which includes data in sb_crc,
it will be treated as a failing checksum and a significant
corruption.

There are known prior bugs which leave junk at the end
of the V4 superblock; we don't need to actually fail the
verification in this case if other checks pan out ok.

So if this is a secondary superblock, and the primary
superblock doesn't indicate that this is a V5 filesystem,
don't treat this as an actual checksum failure.

We should probably check the garbage condition as
we do in xfs_repair, and possibly warn about it
or self-heal, but that's a different scope of work.

Stable folks: This can go back to v3.10, which is what
introduced the sb CRC checking that is tripped up by old,
stale, incorrect V4 superblocks w/ unzeroed bits.

Cc: stable@vger.kernel.org
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_sb.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 13c7834b0e65..b7c9aea77f8f 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -588,6 +588,11 @@ xfs_sb_verify(
  * single bit error could clear the feature bit and unused parts of the
  * superblock are supposed to be zero. Hence a non-null crc field indicates that
  * we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks.  So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
  */
 static void
 xfs_sb_read_verify(
@@ -608,8 +613,12 @@ xfs_sb_read_verify(
 
 		if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
 				      offsetof(struct xfs_sb, sb_crc))) {
-			error = EFSCORRUPTED;
-			goto out_error;
+			/* Only fail bad secondaries on a known V5 filesystem */
+			if (bp->b_bn != XFS_SB_DADDR &&
+			    xfs_sb_version_hascrc(&mp->m_sb)) {
+				error = EFSCORRUPTED;
+				goto out_error;
+			}
 		}
 	}
 	error = xfs_sb_verify(bp, true);
-- 
cgit v1.3


From 8217d146ab98a1790349d79c436176658e311e3c Mon Sep 17 00:00:00 2001
From: Anna Schumaker <bjschuma@netapp.com>
Date: Wed, 30 Oct 2013 13:38:13 -0400
Subject: NFSD: Add support for NFS v4.2 operation checking

The server does allow NFS over v4.2, even if it doesn't add any new
operations yet.

I also switch to using constants to represent the last operation for
each minor version since this makes the code cleaner and easier to
understand at a quick glance.

Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c    | 8 +++++---
 include/linux/nfs4.h | 3 +++
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 99bebea20668..83db5a742aa9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1585,11 +1585,13 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 static inline bool
 nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
 {
-	if (op->opnum < FIRST_NFS4_OP || op->opnum > LAST_NFS4_OP)
+	if (op->opnum < FIRST_NFS4_OP)
 		return false;
-	else if (argp->minorversion == 0 && op->opnum > OP_RELEASE_LOCKOWNER)
+	else if (argp->minorversion == 0 && op->opnum > LAST_NFS40_OP)
 		return false;
-	else if (argp->minorversion == 1 && op->opnum > OP_RECLAIM_COMPLETE)
+	else if (argp->minorversion == 1 && op->opnum > LAST_NFS41_OP)
+		return false;
+	else if (argp->minorversion == 2 && op->opnum > LAST_NFS42_OP)
 		return false;
 	return true;
 }
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e36dee52f224..737e40e4c554 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -118,6 +118,9 @@ Needs to be updated if more operations are defined in future.*/
 
 #define FIRST_NFS4_OP	OP_ACCESS
 #define LAST_NFS4_OP 	OP_RECLAIM_COMPLETE
+#define LAST_NFS40_OP	OP_RELEASE_LOCKOWNER
+#define LAST_NFS41_OP	OP_RECLAIM_COMPLETE
+#define LAST_NFS42_OP	OP_RECLAIM_COMPLETE
 
 enum nfsstat4 {
 	NFS4_OK = 0,
-- 
cgit v1.3


From cfe58f9dcd9afe181894e2257e3f8aa3fee840c5 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Date: Thu, 31 Oct 2013 14:57:01 +0900
Subject: f2fs: avoid to wait all the node blocks during fsync

Previously, f2fs_sync_file() waits for all the node blocks to be written.
But, we don't need to do that, but wait only the inode-related node blocks.

This patch adds wait_on_node_pages_writeback() in which waits inode-related
node blocks that are on writeback.

Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/f2fs.h |  1 +
 fs/f2fs/file.c |  5 +++--
 fs/f2fs/node.c | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6aaefdbaaac0..625eb4befad4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1026,6 +1026,7 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
 int truncate_xattr_node(struct inode *, struct page *);
+int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
 int remove_inode_page(struct inode *);
 struct page *new_inode_page(struct inode *, const struct qstr *);
 struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 58ed19aaa4c6..7d714f4972d5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -189,8 +189,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 			if (ret)
 				goto out;
 		}
-		filemap_fdatawait_range(sbi->node_inode->i_mapping,
-							0, LONG_MAX);
+		ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
+		if (ret)
+			goto out;
 		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	}
 out:
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8e331d5b9fcd..b527ed4c0225 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1148,6 +1148,46 @@ continue_unlock:
 	return nwritten;
 }
 
+int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct address_space *mapping = sbi->node_inode->i_mapping;
+	pgoff_t index = 0, end = LONG_MAX;
+	struct pagevec pvec;
+	int nr_pages;
+	int ret2 = 0, ret = 0;
+
+	pagevec_init(&pvec, 0);
+	while ((index <= end) &&
+			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			PAGECACHE_TAG_WRITEBACK,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+		unsigned i;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/* until radix tree lookup accepts end_index */
+			if (page->index > end)
+				continue;
+
+			if (ino && ino_of_node(page) == ino)
+				wait_on_page_writeback(page);
+			if (TestClearPageError(page))
+				ret = -EIO;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+		ret2 = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &mapping->flags))
+		ret2 = -EIO;
+	if (!ret)
+		ret = ret2;
+	return ret;
+}
+
 static int f2fs_write_node_page(struct page *page,
 				struct writeback_control *wbc)
 {
-- 
cgit v1.3


From 2746f7a17062d3526116f7ae7f91d88b19c2464e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 31 Oct 2013 11:46:31 -0400
Subject: ext4: don't count free clusters from a corrupt block group

A bg that's been flagged "corrupt" by definition has no free blocks,
so that the allocator won't be tempted to use the damaged bg.
Therefore, we shouldn't count the clusters in the damaged group when
calculating free counts.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
---
 fs/ext4/balloc.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index dc5d572ebd6a..6ea7b1436bbc 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -640,6 +640,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_group_info *grp;
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
@@ -655,7 +656,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += ext4_free_group_clusters(sb, gdp);
+		grp = NULL;
+		if (EXT4_SB(sb)->s_group_info)
+			grp = ext4_get_group_info(sb, i);
+		if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			desc_count += ext4_free_group_clusters(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_block_bitmap(sb, i);
 		if (bitmap_bh == NULL)
@@ -679,7 +684,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += ext4_free_group_clusters(sb, gdp);
+		grp = NULL;
+		if (EXT4_SB(sb)->s_group_info)
+			grp = ext4_get_group_info(sb, i);
+		if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			desc_count += ext4_free_group_clusters(sb, gdp);
 	}
 
 	return desc_count;
-- 
cgit v1.3


From 1acd1c301f4faae80f4d2c7bbd9a4553b131c0e3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 31 Oct 2013 13:03:04 -0400
Subject: nfs: fix inverted test for delegation in nfs4_reclaim_open_state

commit 6686390bab6a0e0 (NFS: remove incorrect "Lock reclaim failed!"
warning.) added a test for a delegation before checking to see if any
reclaimed locks failed. The test however is backward and is only doing
that check when a delegation is held instead of when one isn't.

Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Fixes: 6686390bab6a: NFS: remove incorrect "Lock reclaim failed!" warning.
Cc: stable@vger.kernel.org # 3.12
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 452f4c8dadea..c8e729deb4f7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1476,7 +1476,7 @@ restart:
 		if (status >= 0) {
 			status = nfs4_reclaim_locks(state, ops);
 			if (status >= 0) {
-				if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) {
+				if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
 					spin_lock(&state->state_lock);
 					list_for_each_entry(lock, &state->lock_states, ls_locks) {
 						if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
-- 
cgit v1.3


From 1ce0aa802c4cbe6dfe2c7fa95c839d75a509c9a5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 31 Oct 2013 20:37:15 +0100
Subject: jbd: Revert "jbd: remove dependency on __GFP_NOFAIL"

This reverts commit 05713082ab7690a2b22b044cfc867f346c39cd2d. The idea
to remove __GFP_NOFAIL was opposed by Andrew Morton. Although mm guys do
want to get rid of __GFP_NOFAIL users, opencoding the allocation retry
is even worse.

See emails following
http://www.gossamer-threads.com/lists/linux/kernel/1809153#1809153

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index be0c39b66fe0..aa603e017d22 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -26,7 +26,6 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
-#include <linux/backing-dev.h>
 
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -100,10 +99,11 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 
 alloc_transaction:
 	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
+		new_transaction = kzalloc(sizeof(*new_transaction),
+						GFP_NOFS|__GFP_NOFAIL);
 		if (!new_transaction) {
-			congestion_wait(BLK_RW_ASYNC, HZ/50);
-			goto alloc_transaction;
+			ret = -ENOMEM;
+			goto out;
 		}
 	}
 
-- 
cgit v1.3


From bb86d21cba22a045b09d11b71decf5ca7c3d5def Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Fri, 25 Oct 2013 14:52:44 +0800
Subject: xfs: fix the extent count when allocating an new indirection array
 entry

At xfs_iext_add(), if extent(s) are being appended to the last page in
the indirection array and the new extent(s) don't fit in the page, the
number of extents(erp->er_extcount) in a new allocated entry should be
the minimum value between count and XFS_LINEAR_EXTS, instead of count.

For now, there is no existing test case can demonstrates a problem with
the er_extcount being set incorrectly here, but it obviously like a bug.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode_fork.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index 22c9837c5d4b..cfee14a83cfe 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -1021,15 +1021,14 @@ xfs_iext_add(
 		 * the next index needed in the indirection array.
 		 */
 		else {
-			int	count = ext_diff;
+			uint	count = ext_diff;
 
 			while (count) {
 				erp = xfs_iext_irec_new(ifp, erp_idx);
-				erp->er_extcount = count;
-				count -= MIN(count, (int)XFS_LINEAR_EXTS);
-				if (count) {
+				erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+				count -= erp->er_extcount;
+				if (count)
 					erp_idx++;
-				}
 			}
 		}
 	}
-- 
cgit v1.3


From dcb9917ba041866686fe152850364826c4622a36 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 31 Oct 2013 23:00:24 -0400
Subject: ext4: avoid bh leak in retry path of ext4_expand_extra_isize_ea()

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/xattr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 03e9bebba198..1423c4816a47 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1352,6 +1352,7 @@ retry:
 					new_extra_isize = s_min_extra_isize;
 					kfree(is); is = NULL;
 					kfree(bs); bs = NULL;
+					brelse(bh);
 					goto retry;
 				}
 				error = -1;
-- 
cgit v1.3


From 12207f69b3ef4d6eea6a5568281e49f671977ab1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 1 Nov 2013 10:49:32 -0400
Subject: nfs: fix oops when trying to set SELinux label

Chao reported the following oops when testing labeled NFS:

BUG: unable to handle kernel NULL pointer dereference at           (null)
IP: [<ffffffffa0568703>] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4]
PGD 277bbd067 PUD 2777ea067 PMD 0
Oops: 0000 [#1] SMP
Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache sg coretemp kvm_intel kvm crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul iTCO_wdt glue_helper ablk_helper cryptd iTCO_vendor_support bnx2 pcspkr serio_raw i7core_edac cdc_ether microcode usbnet edac_core mii lpc_ich i2c_i801 mfd_core shpchp ioatdma dca acpi_cpufreq mperf nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sr_mod sd_mod cdrom crc_t10dif mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ata_generic ttm pata_acpi drm ata_piix libata megaraid_sas i2c_core dm_mirror dm_region_hash dm_log dm_mod
CPU: 4 PID: 25657 Comm: chcon Not tainted 3.10.0-33.el7.x86_64 #1
Hardware name: IBM System x3550 M3 -[7944OEJ]-/90Y4784     , BIOS -[D6E150CUS-1.11]- 02/08/2011
task: ffff880178397220 ti: ffff8801595d2000 task.ti: ffff8801595d2000
RIP: 0010:[<ffffffffa0568703>]  [<ffffffffa0568703>] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4]
RSP: 0018:ffff8801595d3888  EFLAGS: 00010296
RAX: 0000000000000000 RBX: ffff8801595d3b30 RCX: 0000000000000b4c
RDX: ffff8801595d3b30 RSI: ffff8801595d38e0 RDI: ffff880278b6ec00
RBP: ffff8801595d38c8 R08: ffff8801595d3b30 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801595d38e0
R13: ffff880277a4a780 R14: ffffffffa05686c0 R15: ffff8802765f206c
FS:  00007f2c68486800(0000) GS:ffff88027fc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000027651a000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Stack:
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 ffff880277865800 ffff880278b6ec00 ffff880277a4a780
 ffff8801595d3948 ffffffffa02ad926 ffff8801595d3b30 ffff8802765f206c
Call Trace:
 [<ffffffffa02ad926>] rpcauth_wrap_req+0x86/0xd0 [sunrpc]
 [<ffffffffa02a1d40>] ? call_connect+0xb0/0xb0 [sunrpc]
 [<ffffffffa02a1d40>] ? call_connect+0xb0/0xb0 [sunrpc]
 [<ffffffffa02a1ecb>] call_transmit+0x18b/0x290 [sunrpc]
 [<ffffffffa02a1d40>] ? call_connect+0xb0/0xb0 [sunrpc]
 [<ffffffffa02aae14>] __rpc_execute+0x84/0x400 [sunrpc]
 [<ffffffffa02ac40e>] rpc_execute+0x5e/0xa0 [sunrpc]
 [<ffffffffa02a2ea0>] rpc_run_task+0x70/0x90 [sunrpc]
 [<ffffffffa02a2f03>] rpc_call_sync+0x43/0xa0 [sunrpc]
 [<ffffffffa055284d>] _nfs4_do_set_security_label+0x11d/0x170 [nfsv4]
 [<ffffffffa0558861>] nfs4_set_security_label.isra.69+0xf1/0x1d0 [nfsv4]
 [<ffffffff815fca8b>] ? avc_alloc_node+0x24/0x125
 [<ffffffff815fcd2f>] ? avc_compute_av+0x1a3/0x1b5
 [<ffffffffa055897b>] nfs4_xattr_set_nfs4_label+0x3b/0x50 [nfsv4]
 [<ffffffff811bc772>] generic_setxattr+0x62/0x80
 [<ffffffff811bcfc3>] __vfs_setxattr_noperm+0x63/0x1b0
 [<ffffffff811bd1c5>] vfs_setxattr+0xb5/0xc0
 [<ffffffff811bd2fe>] setxattr+0x12e/0x1c0
 [<ffffffff811a4d22>] ? final_putname+0x22/0x50
 [<ffffffff811a4f2b>] ? putname+0x2b/0x40
 [<ffffffff811aa1cf>] ? user_path_at_empty+0x5f/0x90
 [<ffffffff8119bc29>] ? __sb_start_write+0x49/0x100
 [<ffffffff811bd66f>] SyS_lsetxattr+0x8f/0xd0
 [<ffffffff8160cf99>] system_call_fastpath+0x16/0x1b
Code: 48 8b 02 48 c7 45 c0 00 00 00 00 48 c7 45 c8 00 00 00 00 48 c7 45 d0 00 00 00 00 48 c7 45 d8 00 00 00 00 48 c7 45 e0 00 00 00 00 <48> 8b 00 48 8b 00 48 85 c0 0f 84 ae 00 00 00 48 8b 80 b8 03 00
RIP  [<ffffffffa0568703>] nfs4_xdr_enc_setattr+0x43/0x110 [nfsv4]
 RSP <ffff8801595d3888>
CR2: 0000000000000000

The problem is that _nfs4_do_set_security_label calls rpc_call_sync()
directly which fails to do any setup of the SEQUENCE call. Have it use
nfs4_call_sync() instead which does the right thing. While we're at it
change the name of "args" to "arg" to better match the pattern in
_nfs4_do_setattr.

Reported-by: Chao Ye <cye@redhat.com>
Cc: David Quigley <dpquigl@davequigley.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Cc: stable@vger.kernel.org # 3.11+
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b02c4cc7b0a9..d436c576e988 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4657,7 +4657,7 @@ static int _nfs4_do_set_security_label(struct inode *inode,
 	struct iattr sattr = {0};
 	struct nfs_server *server = NFS_SERVER(inode);
 	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
-	struct nfs_setattrargs args = {
+	struct nfs_setattrargs arg = {
 		.fh             = NFS_FH(inode),
 		.iap            = &sattr,
 		.server		= server,
@@ -4671,14 +4671,14 @@ static int _nfs4_do_set_security_label(struct inode *inode,
 	};
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
-		.rpc_argp       = &args,
+		.rpc_argp       = &arg,
 		.rpc_resp       = &res,
 	};
 	int status;
 
-	nfs4_stateid_copy(&args.stateid, &zero_stateid);
+	nfs4_stateid_copy(&arg.stateid, &zero_stateid);
 
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (status)
 		dprintk("%s failed: %d\n", __func__, status);
 
-- 
cgit v1.3


From fcb63a9bd8427fc584229048ea14f1453dc9a2e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 1 Nov 2013 12:42:25 -0400
Subject: NFS: Fix a missing initialisation when reading the SELinux label

Ensure that _nfs4_do_get_security_label() also initialises the
SEQUENCE call correctly, by having it call into nfs4_call_sync().

Reported-by: Jeff Layton <jlayton@redhat.com>
Cc: stable@vger.kernel.org # 3.11+
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d436c576e988..7e28b5c77cbc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4602,7 +4602,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
 	struct nfs4_label label = {0, 0, buflen, buf};
 
 	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
-	struct nfs4_getattr_arg args = {
+	struct nfs4_getattr_arg arg = {
 		.fh		= NFS_FH(inode),
 		.bitmask	= bitmask,
 	};
@@ -4613,14 +4613,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
-		.rpc_argp	= &args,
+		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
 	int ret;
 
 	nfs_fattr_init(&fattr);
 
-	ret = rpc_call_sync(server->client, &msg, 0);
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);
 	if (ret)
 		return ret;
 	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
-- 
cgit v1.3


From 3378b7f40d79930f0f447a164c7e8fcbe4480e40 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 1 Nov 2013 10:42:15 -0400
Subject: nfsd4: fix discarded security labels on setattr

Security labels in setattr calls are currently ignored because we forget
to set label->len.

Cc: stable@vger.kernel.org
Reported-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 83db5a742aa9..d9d7fa94967f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -411,6 +411,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 		label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
 		if (!label->data)
 			return nfserr_jukebox;
+		label->len = dummy32;
 		defer_free(argp, kfree, label->data);
 		memcpy(label->data, buf, dummy32);
 	}
-- 
cgit v1.3


From 044e3bc33391b1f2769d5ab2c04f246c3d8e04c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 1 Nov 2013 13:16:53 -0400
Subject: sysfs: use generic_file_llseek() for sysfs_file_operations

13c589d5b0ac6 ("sysfs: use seq_file when reading regular files")
converted regular sysfs files to use seq_file.  The commit substituted
generic_file_llseek() with seq_lseek() for llseek implementation.

Before the change, all regular sysfs files were allowed to seek to any
position in [0, PAGE_SIZE] as the file size is always PAGE_SIZE and
generic_file_llseek() allows any seeking inside the range under file
size; however, seq_lseek()'s behavior is different.  It traverses the
output by repeatedly invoking ->show() until it reaches the target
offset or traversal indicates EOF.  As seq_files are fully dynamic and
may not end at all, it doesn't support seeking from the end
(SEEK_END).

Apparently, there are userland tools which uses SEEK_END to discover
the buffer size to use and the switch to seq_lseek() disturbs them as
SEEK_END fails with -EINVAL.

The only benefits of using seq_lseek() instead of
generic_file_llseek() are

* Early failure.  If traversing to certain file position should fail,
  seq_lseek() will report such failures on lseek(2) instead of the
  following read/write operations.

* EOF detection.  While SEEK_END is not supported, SEEK_SET/CUR +
  large offset can be used to detect eof - eof at the time of the seek
  anyway as the file size may change dynamically.

Both aren't necessary for sysfs or prospect kernfs users.  Revert to
genefic_file_llseek() and preserve the original behavior.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Link: https://lkml.kernel.org/r/20131031114358.GA5551@osiris
Tested-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 382db3c045f3..79b5da2acbe1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -800,7 +800,7 @@ EXPORT_SYMBOL_GPL(sysfs_notify);
 const struct file_operations sysfs_file_operations = {
 	.read		= seq_read,
 	.write		= sysfs_write_file,
-	.llseek		= seq_lseek,
+	.llseek		= generic_file_llseek,
 	.open		= sysfs_open_file,
 	.release	= sysfs_release,
 	.poll		= sysfs_poll,
-- 
cgit v1.3


From 0cae60f91494e34a0c5391f1455f825d5849b05f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 30 Oct 2013 10:28:36 -0400
Subject: sysfs: rename sysfs_assoc_lock and explain what it's about

sysfs_assoc_lock is an odd piece of locking.  In general, whoever owns
a kobject is responsible for synchronizing sysfs operations and sysfs
proper assumes that, for example, removal won't race with any other
operation; however, this doesn't work for symlinking because an entity
performing symlink doesn't usually own the target kobject and thus has
no control over its removal.

sysfs_assoc_lock synchronizes symlink operations against kobj->sd
disassociation so that symlink code doesn't end up dereferencing
already freed sysfs_dirent by racing with removal of the target
kobject.

This is quite obscure and the generic name of the lock and lack of
comments make it difficult to understand its role.  Let's rename it to
sysfs_symlink_target_lock and add comments explaining what's going on.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c     | 18 +++++++++++++++---
 fs/sysfs/symlink.c | 20 ++++++++++++++------
 fs/sysfs/sysfs.h   |  2 +-
 3 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index de47ed32d5c7..08c66969d52a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -26,7 +26,7 @@
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
-DEFINE_SPINLOCK(sysfs_assoc_lock);
+DEFINE_SPINLOCK(sysfs_symlink_target_lock);
 
 #define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
 
@@ -902,9 +902,21 @@ void sysfs_remove_dir(struct kobject *kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 
-	spin_lock(&sysfs_assoc_lock);
+	/*
+	 * In general, kboject owner is responsible for ensuring removal
+	 * doesn't race with other operations and sysfs doesn't provide any
+	 * protection; however, when @kobj is used as a symlink target, the
+	 * symlinking entity usually doesn't own @kobj and thus has no
+	 * control over removal.  @kobj->sd may be removed anytime and
+	 * symlink code may end up dereferencing an already freed sd.
+	 *
+	 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation
+	 * against symlink operations so that symlink code can safely
+	 * dereference @kobj->sd.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
 	kobj->sd = NULL;
-	spin_unlock(&sysfs_assoc_lock);
+	spin_unlock(&sysfs_symlink_target_lock);
 
 	if (sd) {
 		WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 22ea2f5796f5..1a23681b8179 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -32,13 +32,15 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 
 	BUG_ON(!name || !parent_sd);
 
-	/* target->sd can go away beneath us but is protected with
-	 * sysfs_assoc_lock.  Fetch target_sd from it.
+	/*
+	 * We don't own @target and it may be removed at any time.
+	 * Synchronize using sysfs_symlink_target_lock.  See
+	 * sysfs_remove_dir() for details.
 	 */
-	spin_lock(&sysfs_assoc_lock);
+	spin_lock(&sysfs_symlink_target_lock);
 	if (target->sd)
 		target_sd = sysfs_get(target->sd);
-	spin_unlock(&sysfs_assoc_lock);
+	spin_unlock(&sysfs_symlink_target_lock);
 
 	error = -ENOENT;
 	if (!target_sd)
@@ -140,10 +142,16 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 			const char *name)
 {
 	const void *ns = NULL;
-	spin_lock(&sysfs_assoc_lock);
+
+	/*
+	 * We don't own @target and it may be removed at any time.
+	 * Synchronize using sysfs_symlink_target_lock.  See
+	 * sysfs_remove_dir() for details.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
 	if (targ->sd)
 		ns = targ->sd->s_ns;
-	spin_unlock(&sysfs_assoc_lock);
+	spin_unlock(&sysfs_symlink_target_lock);
 	sysfs_hash_and_remove(kobj->sd, name, ns);
 }
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 05d063fe69c7..e3aea92ebfa3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -159,7 +159,7 @@ extern struct kmem_cache *sysfs_dir_cachep;
  * dir.c
  */
 extern struct mutex sysfs_mutex;
-extern spinlock_t sysfs_assoc_lock;
+extern spinlock_t sysfs_symlink_target_lock;
 extern const struct dentry_operations sysfs_dentry_ops;
 
 extern const struct file_operations sysfs_dir_operations;
-- 
cgit v1.3


From 3d378d3fd82a759d59c60d89b4559bf325d7e668 Mon Sep 17 00:00:00 2001
From: Tim Gardner <tim.gardner@canonical.com>
Date: Sat, 2 Nov 2013 12:50:34 -0500
Subject: cifs: Make big endian multiplex ID sequences monotonic on the wire

The multiplex identifier (MID) in the SMB header is only
ever used by the client, in conjunction with PID, to match responses
from the server. As such, the endianess of the MID is not important.
However, When tracing packet sequences on the wire, protocol analyzers
such as wireshark display MID as little endian. It is much more informative
for the on-the-wire MID sequences to match debug information emitted by the
CIFS driver.  Therefore, one should write and read MID in the SMB header
assuming it is always little endian.

Observed from wireshark during the protocol negotiation
and session setup:

        Multiplex ID: 256
        Multiplex ID: 256
        Multiplex ID: 512
        Multiplex ID: 512
        Multiplex ID: 768
        Multiplex ID: 768

After this patch on-the-wire MID values begin at 1 and increase monotonically.

Introduce get_next_mid64() for the internal consumers that use the full 64 bit
multiplex identifier.

Introduce the helpers get_mid() and compare_mid() to make the endian
translation clear.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Tim Gardner <timg@tpi.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsglob.h      | 25 ++++++++++++++++++++++++-
 fs/cifs/cifspdu.h       |  2 +-
 fs/cifs/misc.c          | 10 ++++++----
 fs/cifs/smb1ops.c       |  4 ++--
 fs/cifs/smb2transport.c |  2 +-
 fs/cifs/transport.c     |  2 +-
 6 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 06e8947fc370..670da1e55be7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -622,11 +622,34 @@ set_credits(struct TCP_Server_Info *server, const int val)
 }
 
 static inline __u64
-get_next_mid(struct TCP_Server_Info *server)
+get_next_mid64(struct TCP_Server_Info *server)
 {
 	return server->ops->get_next_mid(server);
 }
 
+static inline __le16
+get_next_mid(struct TCP_Server_Info *server)
+{
+	__u16 mid = get_next_mid64(server);
+	/*
+	 * The value in the SMB header should be little endian for easy
+	 * on-the-wire decoding.
+	 */
+	return cpu_to_le16(mid);
+}
+
+static inline __u16
+get_mid(const struct smb_hdr *smb)
+{
+	return le16_to_cpu(smb->Mid);
+}
+
+static inline bool
+compare_mid(__u16 mid, const struct smb_hdr *smb)
+{
+	return mid == le16_to_cpu(smb->Mid);
+}
+
 /*
  * When the server supports very large reads and writes via POSIX extensions,
  * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index d40bd77a2392..f80f98f5ef9c 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -428,7 +428,7 @@ struct smb_hdr {
 	__u16 Tid;
 	__le16 Pid;
 	__u16 Uid;
-	__u16 Mid;
+	__le16 Mid;
 	__u8 WordCount;
 } __attribute__((packed));
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 298e31e3bdc6..2f9f3790679d 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -295,7 +295,8 @@ check_smb_hdr(struct smb_hdr *smb)
 	if (smb->Command == SMB_COM_LOCKING_ANDX)
 		return 0;
 
-	cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid);
+	cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
+		 get_mid(smb));
 	return 1;
 }
 
@@ -351,6 +352,7 @@ checkSMB(char *buf, unsigned int total_read)
 	}
 
 	if (4 + rfclen != clc_len) {
+		__u16 mid = get_mid(smb);
 		/* check if bcc wrapped around for large read responses */
 		if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
 			/* check if lengths match mod 64K */
@@ -358,11 +360,11 @@ checkSMB(char *buf, unsigned int total_read)
 				return 0; /* bcc wrapped */
 		}
 		cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
-			 clc_len, 4 + rfclen, smb->Mid);
+			 clc_len, 4 + rfclen, mid);
 
 		if (4 + rfclen < clc_len) {
 			cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
-				 rfclen, smb->Mid);
+				 rfclen, mid);
 			return -EIO;
 		} else if (rfclen > clc_len + 512) {
 			/*
@@ -375,7 +377,7 @@ checkSMB(char *buf, unsigned int total_read)
 			 * data to 512 bytes.
 			 */
 			cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
-				 rfclen, smb->Mid);
+				 rfclen, mid);
 			return -EIO;
 		}
 	}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 8233b174de3d..09ef8f322c2a 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -67,7 +67,7 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
 	mutex_unlock(&server->srv_mutex);
 
 	cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
-		 in_buf->Mid, rc);
+		 get_mid(in_buf), rc);
 
 	return rc;
 }
@@ -101,7 +101,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
 
 	spin_lock(&GlobalMid_Lock);
 	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
-		if (mid->mid == buf->Mid &&
+		if (compare_mid(mid->mid, buf) &&
 		    mid->mid_state == MID_REQUEST_SUBMITTED &&
 		    le16_to_cpu(mid->command) == buf->Command) {
 			spin_unlock(&GlobalMid_Lock);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 340abca3aa52..c523617eade2 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -466,7 +466,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 static inline void
 smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
 {
-	hdr->MessageId = get_next_mid(server);
+	hdr->MessageId = get_next_mid64(server);
 }
 
 static struct mid_q_entry *
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 800b938e4061..0ee6d249ef6f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -58,7 +58,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 		return temp;
 	else {
 		memset(temp, 0, sizeof(struct mid_q_entry));
-		temp->mid = smb_buffer->Mid;	/* always LE */
+		temp->mid = get_mid(smb_buffer);
 		temp->pid = current->pid;
 		temp->command = cpu_to_le16(smb_buffer->Command);
 		cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
-- 
cgit v1.3


From 7f48558e6489d032b1584b0cc9ac4bb11072c034 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Sat, 12 Oct 2013 10:06:03 -0500
Subject: cifs: Send a logoff request before removing a smb session

Send a smb session logoff request before removing smb session off of the list.
On a signed smb session, remvoing a session off of the list before sending
a logoff request results in server returning an error for lack of
smb signature.

Never seen an error during smb logoff, so as per MS-SMB2 3.2.5.1,
not sure how an error during logoff should be retried. So for now,
if a server returns an error to a logoff request, log the error and
remove the session off of the list.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c       | 25 ++++++++++++++++++++-----
 fs/cifs/smb2transport.c | 10 ++++++++--
 fs/cifs/transport.c     | 11 +++++++++--
 3 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a279ffc0bc29..62a55147400a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2242,6 +2242,8 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+		if (ses->status == CifsExiting)
+			continue;
 		if (!match_session(ses, vol))
 			continue;
 		++ses->ses_count;
@@ -2255,24 +2257,37 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 static void
 cifs_put_smb_ses(struct cifs_ses *ses)
 {
-	unsigned int xid;
+	unsigned int rc, xid;
 	struct TCP_Server_Info *server = ses->server;
 
 	cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
+
 	spin_lock(&cifs_tcp_ses_lock);
+	if (ses->status == CifsExiting) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
 	if (--ses->ses_count > 0) {
 		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
-
-	list_del_init(&ses->smb_ses_list);
+	if (ses->status == CifsGood)
+		ses->status = CifsExiting;
 	spin_unlock(&cifs_tcp_ses_lock);
 
-	if (ses->status == CifsGood && server->ops->logoff) {
+	if (ses->status == CifsExiting && server->ops->logoff) {
 		xid = get_xid();
-		server->ops->logoff(xid, ses);
+		rc = server->ops->logoff(xid, ses);
+		if (rc)
+			cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
+				__func__, rc);
 		_free_xid(xid);
 	}
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_del_init(&ses->smb_ses_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
 	sesInfoFree(ses);
 	cifs_put_tcp_session(server);
 }
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index c523617eade2..59c748ce872f 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -516,13 +516,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
 		return -EAGAIN;
 	}
 
-	if (ses->status != CifsGood) {
-		/* check if SMB2 session is bad because we are setting it up */
+	if (ses->status == CifsNew) {
 		if ((buf->Command != SMB2_SESSION_SETUP) &&
 		    (buf->Command != SMB2_NEGOTIATE))
 			return -EAGAIN;
 		/* else ok - we are setting up session */
 	}
+
+	if (ses->status == CifsExiting) {
+		if (buf->Command != SMB2_LOGOFF)
+			return -EAGAIN;
+		/* else ok - we are shutting down the session */
+	}
+
 	*mid = smb2_mid_entry_alloc(buf, ses->server);
 	if (*mid == NULL)
 		return -ENOMEM;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ee6d249ef6f..b37570952846 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -431,13 +431,20 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
 		return -EAGAIN;
 	}
 
-	if (ses->status != CifsGood) {
-		/* check if SMB session is bad because we are setting it up */
+	if (ses->status == CifsNew) {
 		if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
 			(in_buf->Command != SMB_COM_NEGOTIATE))
 			return -EAGAIN;
 		/* else ok - we are setting up session */
 	}
+
+	if (ses->status == CifsExiting) {
+		/* check if SMB session is bad because we are setting it up */
+		if (in_buf->Command != SMB_COM_LOGOFF_ANDX)
+			return -EAGAIN;
+		/* else ok - we are shutting down session */
+	}
+
 	*ppmidQ = AllocMidQEntry(in_buf, ses->server);
 	if (*ppmidQ == NULL)
 		return -ENOMEM;
-- 
cgit v1.3


From 2167114c6ea6e76fd84e368bae5389d37dd156aa Mon Sep 17 00:00:00 2001
From: Steven French <smfrench@gmail.com>
Date: Wed, 9 Oct 2013 13:36:35 -0500
Subject: Query device characteristics at mount time from server on SMB2/3 not
 just on cifs mounts

Currently SMB2 and SMB3 mounts do not query the device information at mount time
from the server as is done for cifs.  These can be useful for debugging.
This is a minor patch, that extends the previous one (which added ability to
query file system attributes at mount time - this returns the device
characteristics - also via in /proc/fs/cifs/DebugData)

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2ops.c   |  5 ++++-
 fs/cifs/smb2pdu.c   | 31 +++++++++++++++++++++++--------
 fs/cifs/smb2proto.h |  2 +-
 3 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index a53e2053ae9d..79084d67f3fb 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -229,7 +229,10 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 	if (rc)
 		return;
 
-	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_ATTRIBUTE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_DEVICE_INFORMATION);
 	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	return;
 }
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index df12cf8bd979..7887cf50e5fb 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2358,17 +2358,27 @@ qfsinf_exit:
 
 int
 SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
-	      u64 persistent_fid, u64 volatile_fid)
+	      u64 persistent_fid, u64 volatile_fid, int level)
 {
 	struct smb2_query_info_rsp *rsp = NULL;
 	struct kvec iov;
 	int rc = 0;
-	int resp_buftype;
+	int resp_buftype, max_len, min_len;
 	struct cifs_ses *ses = tcon->ses;
 	unsigned int rsp_len, offset;
 
-	rc = build_qfs_info_req(&iov, tcon, SMB_QUERY_FS_ATTRIBUTE_INFO,
-				sizeof(FILE_SYSTEM_ATTRIBUTE_INFO),
+	if (level == FS_DEVICE_INFORMATION) {
+		max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+		min_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+	} else if (level == FS_ATTRIBUTE_INFORMATION) {
+		max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO);
+		min_len = MIN_FS_ATTR_INFO_SIZE;
+	} else {
+		cifs_dbg(FYI, "Invalid qfsinfo level %d", level);
+		return -EINVAL;
+	}
+
+	rc = build_qfs_info_req(&iov, tcon, level, max_len,
 				persistent_fid, volatile_fid);
 	if (rc)
 		return rc;
@@ -2382,12 +2392,17 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 
 	rsp_len = le32_to_cpu(rsp->OutputBufferLength);
 	offset = le16_to_cpu(rsp->OutputBufferOffset);
-	rc = validate_buf(offset, rsp_len, &rsp->hdr, MIN_FS_ATTR_INFO_SIZE);
-	if (!rc) {
+	rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len);
+	if (rc)
+		goto qfsattr_exit;
+
+	if (level == FS_ATTRIBUTE_INFORMATION)
 		memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset
 			+ (char *)&rsp->hdr, min_t(unsigned int,
-			rsp_len, sizeof(FILE_SYSTEM_ATTRIBUTE_INFO)));
-	}
+			rsp_len, max_len));
+	else if (level == FS_DEVICE_INFORMATION)
+		memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset
+			+ (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
 
 qfsattr_exit:
 	free_rsp_buf(resp_buftype, iov.iov_base);
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 68dc00d5fb12..313813e4c19b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -151,7 +151,7 @@ extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 			 u64 persistent_file_id, u64 volatile_file_id,
 			 struct kstatfs *FSData);
 extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
-			 u64 persistent_file_id, u64 volatile_file_id);
+			 u64 persistent_file_id, u64 volatile_file_id, int lvl);
 extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
 		     const __u64 persist_fid, const __u64 volatile_fid,
 		     const __u32 pid, const __u64 length, const __u64 offset,
-- 
cgit v1.3


From af6a12ea8d4bb39a87527835b943bde4215897e5 Mon Sep 17 00:00:00 2001
From: Steven French <smfrench@gmail.com>
Date: Wed, 9 Oct 2013 20:55:53 -0500
Subject: Query File System Alignment

In SMB3 it is now possible to query the file system
alignment info, and the preferred (for performance)
sector size and whether the underlying disk
has no seek penalty (like SSD).

Query this information at mount time for SMB3,
and make it visible in /proc/fs/cifs/DebugData
for debugging purposes.

This alignment information and preferred sector
size info will be helpful for the copy offload
patches to setup the right chunks in the CopyChunk
requests.   Presumably the knowledge that the
underlying disk is SSD could also help us
make better readahead and writebehind
decisions (something to look at in the future).

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsglob.h |  2 ++
 fs/cifs/smb2ops.c  | 44 +++++++++++++++++++++++++++++++++++++++++++-
 fs/cifs/smb2pdu.c  | 12 +++++++++++-
 fs/cifs/smb2pdu.h  | 22 ++++++++++++++++++++--
 4 files changed, 76 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 670da1e55be7..26b1c1dc93f6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -853,6 +853,8 @@ struct cifs_tcon {
 	__u32 maximal_access;
 	__u32 vol_serial_number;
 	__le64 vol_create_time;
+	__u32 ss_flags;		/* sector size flags */
+	__u32 perf_sector_size; /* best sector size for perf */
 #endif /* CONFIG_CIFS_SMB2 */
 #ifdef CONFIG_CIFS_FSCACHE
 	u64 resource_id;		/* server resource id */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 79084d67f3fb..25759c89619a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -209,6 +209,36 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	return rsize;
 }
 
+static void
+smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	__le16 srch_path = 0; /* Null - open root of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (rc)
+		return;
+
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_ATTRIBUTE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_DEVICE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	return;
+}
+
 static void
 smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 {
@@ -332,7 +362,19 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
 		seq_puts(m, " ASYMMETRIC,");
 	if (tcon->capabilities == 0)
 		seq_puts(m, " None");
+	if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE)
+		seq_puts(m, " Aligned,");
+	if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE)
+		seq_puts(m, " Partition Aligned,");
+	if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY)
+		seq_puts(m, " SSD,");
+	if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED)
+		seq_puts(m, " TRIM-support,");
+
 	seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+	if (tcon->perf_sector_size)
+		seq_printf(m, "\tOptimal sector size: 0x%x",
+			   tcon->perf_sector_size);
 }
 
 static void
@@ -1048,7 +1090,7 @@ struct smb_version_operations smb30_operations = {
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
-	.qfs_tcon = smb2_qfs_tcon,
+	.qfs_tcon = smb3_qfs_tcon,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 7887cf50e5fb..8ab05b0d6778 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2373,8 +2373,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	} else if (level == FS_ATTRIBUTE_INFORMATION) {
 		max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO);
 		min_len = MIN_FS_ATTR_INFO_SIZE;
+	} else if (level == FS_SECTOR_SIZE_INFORMATION) {
+		max_len = sizeof(struct smb3_fs_ss_info);
+		min_len = sizeof(struct smb3_fs_ss_info);
 	} else {
-		cifs_dbg(FYI, "Invalid qfsinfo level %d", level);
+		cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level);
 		return -EINVAL;
 	}
 
@@ -2403,6 +2406,13 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	else if (level == FS_DEVICE_INFORMATION)
 		memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset
 			+ (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+	else if (level == FS_SECTOR_SIZE_INFORMATION) {
+		struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
+			(4 /* RFC1001 len */ + offset + (char *)&rsp->hdr);
+		tcon->ss_flags = le32_to_cpu(ss_info->Flags);
+		tcon->perf_sector_size =
+			le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
+	}
 
 qfsattr_exit:
 	free_rsp_buf(resp_buftype, iov.iov_base);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index c7c3c8294d1a..7e44f18cc169 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -877,14 +877,16 @@ struct smb2_lease_ack {
 
 /* File System Information Classes */
 #define FS_VOLUME_INFORMATION		1 /* Query */
-#define FS_LABEL_INFORMATION		2 /* Set */
+#define FS_LABEL_INFORMATION		2 /* Local only */
 #define FS_SIZE_INFORMATION		3 /* Query */
 #define FS_DEVICE_INFORMATION		4 /* Query */
 #define FS_ATTRIBUTE_INFORMATION	5 /* Query */
 #define FS_CONTROL_INFORMATION		6 /* Query, Set */
 #define FS_FULL_SIZE_INFORMATION	7 /* Query */
 #define FS_OBJECT_ID_INFORMATION	8 /* Query, Set */
-#define FS_DRIVER_PATH_INFORMATION	9 /* Query */
+#define FS_DRIVER_PATH_INFORMATION	9 /* Local only */
+#define FS_VOLUME_FLAGS_INFORMATION	10 /* Local only */
+#define FS_SECTOR_SIZE_INFORMATION	11 /* SMB3 or later. Query */
 
 struct smb2_fs_full_size_info {
 	__le64 TotalAllocationUnits;
@@ -894,6 +896,22 @@ struct smb2_fs_full_size_info {
 	__le32 BytesPerSector;
 } __packed;
 
+#define SSINFO_FLAGS_ALIGNED_DEVICE		0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY		0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED		0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+	__le32 LogicalBytesPerSector;
+	__le32 PhysicalBytesPerSectorForAtomicity;
+	__le32 PhysicalBytesPerSectorForPerf;
+	__le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity;
+	__le32 Flags;
+	__le32 ByteOffsetForSectorAlignment;
+	__le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
 /* partial list of QUERY INFO levels */
 #define FILE_DIRECTORY_INFORMATION	1
 #define FILE_FULL_DIRECTORY_INFORMATION 2
-- 
cgit v1.3


From c7f508a99bf229963915e79a603e0618d1d2ba76 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 14 Oct 2013 15:27:32 -0500
Subject: Allow setting per-file compression via CIFS protocol

An earlier patch allowed setting the per-file compression flag

"chattr +c filename"

on an smb2 or smb3 mount, and also allowed lsattr to return
whether a file on a cifs, or smb2/smb3 mount was compressed.

This patch extends the ability to set the per-file
compression flag to the cifs protocol, which uses a somewhat
different IOCTL mechanism than SMB2, although the payload
(the flags stored in the compression_state) are the same.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifspdu.h   | 29 ++++++++++++++++++++++++++++
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/cifssmb.c   | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/smb1ops.c   |  8 ++++++++
 fs/cifs/smb2pdu.h   |  5 +----
 5 files changed, 94 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index f80f98f5ef9c..9e5ee34de986 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1352,6 +1352,35 @@ typedef struct smb_com_transaction_ioctl_req {
 	__u8 Data[1];
 } __attribute__((packed)) TRANSACT_IOCTL_REQ;
 
+typedef struct smb_com_transaction_compr_ioctl_req {
+	struct smb_hdr hdr;	/* wct = 23 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* four setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 2 = IOCTL/FSCTL */
+	__le32 FunctionCode;
+	__u16 Fid;
+	__u8 IsFsctl;  /* 1 = File System Control 0 = device control (IOCTL) */
+	__u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
+	__le16 ByteCount;
+	__u8 Pad[3];
+	__le16 compression_state;  /* See below for valid flags */
+} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ;
+
+/* compression state flags */
+#define COMPRESSION_FORMAT_NONE		0x0000
+#define COMPRESSION_FORMAT_DEFAULT	0x0001
+#define COMPRESSION_FORMAT_LZNT1	0x0002
+
 typedef struct smb_com_transaction_ioctl_rsp {
 	struct smb_hdr hdr;	/* wct = 19 */
 	__u8 Reserved[3];
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b5ec2a268f56..aa3397620342 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -360,6 +360,8 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
 extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 			       __u16 fid, char **symlinkinfo,
 			       const struct nls_table *nls_codepage);
+extern int CIFSSMB_set_compression(const unsigned int xid,
+				   struct cifs_tcon *tcon, __u16 fid);
 extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index ccd31ab815d4..93b29474714a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3199,6 +3199,60 @@ qreparse_out:
 	return rc;
 }
 
+int
+CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		    __u16 fid)
+{
+	int rc = 0;
+	int bytes_returned;
+	struct smb_com_transaction_compr_ioctl_req *pSMB;
+	struct smb_com_transaction_ioctl_rsp *pSMBr;
+
+	cifs_dbg(FYI, "Set compression for %u\n", fid);
+	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+	pSMB->TotalParameterCount = 0;
+	pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+	pSMB->MaxParameterCount = 0;
+	pSMB->MaxDataCount = 0;
+	pSMB->MaxSetupCount = 4;
+	pSMB->Reserved = 0;
+	pSMB->ParameterOffset = 0;
+	pSMB->DataCount = __constant_cpu_to_le32(2);
+	pSMB->DataOffset =
+		cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
+				compression_state) - 4);  /* 84 */
+	pSMB->SetupCount = 4;
+	pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+	pSMB->ParameterCount = 0;
+	pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+	pSMB->IsFsctl = 1; /* FSCTL */
+	pSMB->IsRootFlag = 0;
+	pSMB->Fid = fid; /* file handle always le */
+	/* 3 byte pad, followed by 2 byte compress state */
+	pSMB->ByteCount = __constant_cpu_to_le16(5);
+	inc_rfc1001_len(pSMB, 5);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	/*
+	 * Note: On -EAGAIN error only caller can retry on handle based calls
+	 * since file handle passed in no longer valid.
+	 */
+	return rc;
+}
+
+
 #ifdef CONFIG_CIFS_POSIX
 
 /*Convert an Access Control Entry from wire format to local POSIX xattr format*/
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 09ef8f322c2a..384cffe42850 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -806,6 +806,13 @@ out:
 	return rc;
 }
 
+static int
+cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile)
+{
+	return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid);
+}
+
 static int
 cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 		     const char *path, struct cifs_sb_info *cifs_sb,
@@ -956,6 +963,7 @@ struct smb_version_operations smb1_operations = {
 	.set_path_size = CIFSSMBSetEOF,
 	.set_file_size = CIFSSMBSetFileSize,
 	.set_file_info = smb_set_file_info,
+	.set_compression = cifs_set_compression,
 	.echo = CIFSSMBEcho,
 	.mkdir = CIFSSMBMkDir,
 	.mkdir_setinfo = cifs_mkdir_setinfo,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 7e44f18cc169..6183b1b7550f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -570,12 +570,9 @@ struct network_interface_info_ioctl_rsp {
 #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
 
 struct compress_ioctl {
-	__le16 CompressionState;
+	__le16 CompressionState; /* See cifspdu.h for possible flag values */
 } __packed;
 
-#define COMPRESSION_FORMAT_NONE		0x0000
-#define COMPRESSION_FORMAT_DEFAULT	0x0001
-#define COMPRESSION_FORMAT_LZNT1	0x0002
 struct smb2_ioctl_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;	/* Must be 57 */
-- 
cgit v1.3


From f10d9ba405ed250d647d6d810f8fc5ff2f62f370 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sun, 13 Oct 2013 22:32:30 -0500
Subject: Fix unused variable warning when CIFS POSIX disabled

Fix unused variable warning when CONFIG_CIFS_POSIX disabled.

   fs/cifs/ioctl.c: In function 'cifs_ioctl':
>> fs/cifs/ioctl.c:40:8: warning: unused variable 'ExtAttrMask' [-Wunused-variable]
     __u64 ExtAttrMask = 0;
           ^
Pointed out by 0-DAY kernel build testing backend

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 029867078aff..ba54bf6ab116 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,7 +37,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	struct cifsFileInfo *pSMBFile = filep->private_data;
 	struct cifs_tcon *tcon;
 	__u64	ExtAttrBits = 0;
-	__u64	ExtAttrMask = 0;
 	__u64   caps;
 
 	xid = get_xid();
@@ -54,6 +53,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
 #ifdef CONFIG_CIFS_POSIX
 			if (CIFS_UNIX_EXTATTR_CAP & caps) {
+				__u64	ExtAttrMask = 0;
 				rc = CIFSGetExtAttr(xid, tcon,
 						    pSMBFile->fid.netfid,
 						    &ExtAttrBits, &ExtAttrMask);
-- 
cgit v1.3


From c481e9feee78c6ce1ba0a1c8c892049f6514f6cf Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 14 Oct 2013 01:21:53 -0500
Subject: Query network adapter info at mount time for debugging

When CONFIG_CIFS_STATS2 enabled query adapter info for debugging

It is easy now in SMB3 to query the information about the server's
network interfaces (and at least Windows 8 and above do this, if not
other clients) there are some useful pieces of information you can get
including:

- all of the network interfaces that the server advertises (not just
the one you are mounting over), and with SMB3 supporting multichannel
this helps with more than just failover (also aggregating multiple
sockets under one mount)

- whether the adapter supports RSS (useful to know if you want to
estimate whether setting up two or more socket connections to the same
address is going to be faster due to RSS offload in the adapter)

- whether the server supports RDMA

- whether the server has IPv6 interfaces (if you connected over IPv4
but prefer IPv6 e.g.)

- what the link speed is (you might want to reconnect over a higher
speed interface if available)

(Of course we could also rerequest this on every mount cheaplly to the
same server, as Windows apparently does, so we can update the adapter
info on new mounts, and also on every reconnect if the network
interface drops temporarily - so we don't have to rely on info from
the first mount to this server)

It is trivial to request this information - and certainly will be useful
when we get to the point of doing multichannel (and eventually RDMA),
but some of this (linkspeed etc.) info may help for debugging in
the meantime.  Enable this request when CONFIG_CIFS_STATS2 is on
(only for smb3 mounts since it is an SMB3 or later ioctl).

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2ops.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 25759c89619a..c571be8cb76e 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -209,6 +209,32 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	return rsize;
 }
 
+#ifdef CONFIG_CIFS_STATS2
+static int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	unsigned int ret_data_len = 0;
+	struct network_interface_info_ioctl_rsp *out_buf;
+
+	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+			FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+			NULL /* no data input */, 0 /* no data input */,
+			(char **)&out_buf, &ret_data_len);
+
+	if ((rc == 0)  && (ret_data_len > 0)) {
+		/* Dump info on first interface */
+		cifs_dbg(FYI, "Adapter Capability 0x%x\t",
+			le32_to_cpu(out_buf->Capability));
+		cifs_dbg(FYI, "Link Speed %lld\n",
+			le64_to_cpu(out_buf->LinkSpeed));
+	} else
+		cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+
+	return rc;
+}
+#endif /* STATS2 */
+
 static void
 smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 {
@@ -229,6 +255,10 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
 	if (rc)
 		return;
 
+#ifdef CONFIG_CIFS_STATS2
+	SMB3_request_interfaces(xid, tcon);
+#endif /* STATS2 */
+
 	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
 			FS_ATTRIBUTE_INFORMATION);
 	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
-- 
cgit v1.3


From 4bf08ff6f9f381fa625d65e36c3e8e57313943cf Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Mon, 4 Nov 2013 10:28:33 +0800
Subject: f2fs: remove unnecessary TestClearPageError when wait pages writeback

In wait_on_node_pages_writeback we will test and clear error flag for all
pages in radix tree, but not necessary.
So we only do this for pages belong to the specified inode.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/node.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b527ed4c0225..4ac4150d421d 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1170,10 +1170,11 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
 			if (page->index > end)
 				continue;
 
-			if (ino && ino_of_node(page) == ino)
+			if (ino && ino_of_node(page) == ino) {
 				wait_on_page_writeback(page);
-			if (TestClearPageError(page))
-				ret = -EIO;
+				if (TestClearPageError(page))
+					ret = -EIO;
+			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
-- 
cgit v1.3


From 9b9f039d570bddc1653cda2e38f4331dcacfcde5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 1 Nov 2013 14:52:06 -0400
Subject: GFS2: Use reflink for quota data cache

This patch adds reflink support to the quota data cache. It
looks a bit strange because we still don't have a sensible
split in the lookup by id and the lru list. That is coming in
later patches though.

The intent here is just to swap the current ref count for
reflinks in all cases with as little as possible other change.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Abhijith Das <adas@redhat.com>
Tested-by: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/incore.h |  2 +-
 fs/gfs2/quota.c  | 42 ++++++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index bb88e417231f..9d778044cc6e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -422,7 +422,7 @@ struct gfs2_quota_data {
 	struct list_head qd_list;
 	struct list_head qd_reclaim;
 
-	atomic_t qd_count;
+	struct lockref qd_lockref;
 
 	struct kqid qd_id;
 	unsigned long qd_flags;		/* QDF_... */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 4a9726aa191f..ed089118c171 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -50,6 +50,7 @@
 #include <linux/freezer.h>
 #include <linux/quota.h>
 #include <linux/dqblk_xfs.h>
+#include <linux/lockref.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -148,7 +149,8 @@ static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
 	if (!qd)
 		return -ENOMEM;
 
-	atomic_set(&qd->qd_count, 1);
+	qd->qd_lockref.count = 1;
+	spin_lock_init(&qd->qd_lockref.lock);
 	qd->qd_id = qid;
 	qd->qd_slot = -1;
 	INIT_LIST_HEAD(&qd->qd_reclaim);
@@ -180,13 +182,12 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 		spin_lock(&qd_lru_lock);
 		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
 			if (qid_eq(qd->qd_id, qid)) {
-				if (!atomic_read(&qd->qd_count) &&
-				    !list_empty(&qd->qd_reclaim)) {
+				lockref_get(&qd->qd_lockref);
+				if (!list_empty(&qd->qd_reclaim)) {
 					/* Remove it from reclaim list */
 					list_del_init(&qd->qd_reclaim);
 					atomic_dec(&qd_lru_count);
 				}
-				atomic_inc(&qd->qd_count);
 				found = 1;
 				break;
 			}
@@ -222,18 +223,24 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 static void qd_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-	gfs2_assert(sdp, atomic_read(&qd->qd_count));
-	atomic_inc(&qd->qd_count);
+	gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
+	lockref_get(&qd->qd_lockref);
 }
 
 static void qd_put(struct gfs2_quota_data *qd)
 {
-	if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) {
+	spin_lock(&qd_lru_lock);
+
+	if (!lockref_put_or_lock(&qd->qd_lockref)) {
+
 		/* Add to the reclaim list */
 		list_add_tail(&qd->qd_reclaim, &qd_lru_list);
 		atomic_inc(&qd_lru_count);
-		spin_unlock(&qd_lru_lock);
+
+		spin_unlock(&qd->qd_lockref.lock);
 	}
+
+	spin_unlock(&qd_lru_lock);
 }
 
 static int slot_get(struct gfs2_quota_data *qd)
@@ -394,8 +401,8 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
 	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
 
 	set_bit(QDF_LOCKED, &qd->qd_flags);
-	gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
-	atomic_inc(&qd->qd_count);
+	gfs2_assert_warn(sdp, !__lockref_is_dead(&qd->qd_lockref));
+	lockref_get(&qd->qd_lockref);
 	qd->qd_change_sync = qd->qd_change;
 	gfs2_assert_warn(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
@@ -1303,15 +1310,22 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 	while (!list_empty(head)) {
 		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
 
-		if (atomic_read(&qd->qd_count) > 1 ||
-		    (atomic_read(&qd->qd_count) &&
-		     !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+		/*
+		 * To be removed in due course... we should be able to
+		 * ensure that all refs to the qd have done by this point
+		 * so that this rather odd test is not required
+		 */
+		spin_lock(&qd->qd_lockref.lock);
+		if (qd->qd_lockref.count > 1 ||
+		    (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+			spin_unlock(&qd->qd_lockref.lock);
 			list_move(&qd->qd_list, head);
 			spin_unlock(&qd_lru_lock);
 			schedule();
 			spin_lock(&qd_lru_lock);
 			continue;
 		}
+		spin_unlock(&qd->qd_lockref.lock);
 
 		list_del(&qd->qd_list);
 		/* Also remove if this qd exists in the reclaim list */
@@ -1322,7 +1336,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 		atomic_dec(&sdp->sd_quota_count);
 		spin_unlock(&qd_lru_lock);
 
-		if (!atomic_read(&qd->qd_count)) {
+		if (!qd->qd_lockref.count) {
 			gfs2_assert_warn(sdp, !qd->qd_change);
 			gfs2_assert_warn(sdp, !qd->qd_slot_count);
 		} else
-- 
cgit v1.3


From 7d80823e1d83e35977d77ae201bf63af3317ad0a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 1 Nov 2013 14:52:08 -0400
Subject: GFS2: Rename quota qd_lru_lock qd_lock

This is a straight forward rename which is in preparation for
introducing the generic list_lru infrastructure in the
following patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Abhijith Das <adas@redhat.com>
Tested-by: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/quota.c | 70 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index ed089118c171..466516ac5e57 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -74,7 +74,7 @@ struct gfs2_quota_change_host {
 
 static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(qd_lru_lock);
+static DEFINE_SPINLOCK(qd_lock);
 
 unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 				  struct shrink_control *sc)
@@ -87,7 +87,7 @@ unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 	if (!(sc->gfp_mask & __GFP_FS))
 		return SHRINK_STOP;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	while (nr_to_scan && !list_empty(&qd_lru_list)) {
 		qd = list_entry(qd_lru_list.next,
 				struct gfs2_quota_data, qd_reclaim);
@@ -106,13 +106,13 @@ unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 		/* Delete it from the common reclaim list */
 		list_del_init(&qd->qd_reclaim);
 		atomic_dec(&qd_lru_count);
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 		nr_to_scan--;
 		freed++;
 	}
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 	return freed;
 }
 
@@ -179,7 +179,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 
 	for (;;) {
 		found = 0;
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
 			if (qid_eq(qd->qd_id, qid)) {
 				lockref_get(&qd->qd_lockref);
@@ -203,7 +203,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 			new_qd = NULL;
 		}
 
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 
 		if (qd) {
 			if (new_qd) {
@@ -229,7 +229,7 @@ static void qd_hold(struct gfs2_quota_data *qd)
 
 static void qd_put(struct gfs2_quota_data *qd)
 {
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 
 	if (!lockref_put_or_lock(&qd->qd_lockref)) {
 
@@ -240,7 +240,7 @@ static void qd_put(struct gfs2_quota_data *qd)
 		spin_unlock(&qd->qd_lockref.lock);
 	}
 
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 }
 
 static int slot_get(struct gfs2_quota_data *qd)
@@ -249,10 +249,10 @@ static int slot_get(struct gfs2_quota_data *qd)
 	unsigned int c, o = 0, b;
 	unsigned char byte = 0;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 
 	if (qd->qd_slot_count++) {
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 		return 0;
 	}
 
@@ -276,13 +276,13 @@ found:
 
 	sdp->sd_quota_bitmap[c][o] |= 1 << b;
 
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	return 0;
 
 fail:
 	qd->qd_slot_count--;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 	return -ENOSPC;
 }
 
@@ -290,10 +290,10 @@ static void slot_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 }
 
 static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
@@ -320,13 +320,13 @@ static void slot_put(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	if (!--qd->qd_slot_count) {
 		gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
 		qd->qd_slot = -1;
 	}
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 }
 
 static int bh_get(struct gfs2_quota_data *qd)
@@ -420,7 +420,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	if (sdp->sd_vfs->s_flags & MS_RDONLY)
 		return 0;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 
 	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
 		found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen);
@@ -431,7 +431,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	if (!found)
 		qd = NULL;
 
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	if (qd) {
 		gfs2_assert_warn(sdp, qd->qd_change_sync);
@@ -598,9 +598,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 	x = be64_to_cpu(qc->qc_change) + change;
 	qc->qc_change = cpu_to_be64(x);
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	qd->qd_change = x;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	if (!x) {
 		gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
@@ -972,9 +972,9 @@ static int need_sync(struct gfs2_quota_data *qd)
 	if (!qd->qd_qb.qb_limit)
 		return 0;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	value = qd->qd_change;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	spin_lock(&gt->gt_spin);
 	num = gt->gt_quota_scale_num;
@@ -1019,9 +1019,9 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
 		if (!sync)
 			continue;
 
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 		found = qd_check_sync(sdp, qd, NULL);
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 
 		if (!found)
 			continue;
@@ -1083,9 +1083,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 			continue;
 
 		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 		value += qd->qd_change;
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 
 		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
 			print_message(qd, "exceeded");
@@ -1276,11 +1276,11 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 			qd->qd_slot = slot;
 			qd->qd_slot_count = 1;
 
-			spin_lock(&qd_lru_lock);
+			spin_lock(&qd_lock);
 			gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
 			list_add(&qd->qd_list, &sdp->sd_quota_list);
 			atomic_inc(&sdp->sd_quota_count);
-			spin_unlock(&qd_lru_lock);
+			spin_unlock(&qd_lock);
 
 			found++;
 		}
@@ -1306,7 +1306,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 	struct gfs2_quota_data *qd;
 	unsigned int x;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	while (!list_empty(head)) {
 		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
 
@@ -1320,9 +1320,9 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 		    (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
 			spin_unlock(&qd->qd_lockref.lock);
 			list_move(&qd->qd_list, head);
-			spin_unlock(&qd_lru_lock);
+			spin_unlock(&qd_lock);
 			schedule();
-			spin_lock(&qd_lru_lock);
+			spin_lock(&qd_lock);
 			continue;
 		}
 		spin_unlock(&qd->qd_lockref.lock);
@@ -1334,7 +1334,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 			atomic_dec(&qd_lru_count);
 		}
 		atomic_dec(&sdp->sd_quota_count);
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 
 		if (!qd->qd_lockref.count) {
 			gfs2_assert_warn(sdp, !qd->qd_change);
@@ -1346,9 +1346,9 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 		gfs2_glock_put(qd->qd_gl);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
 
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 	}
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
 
-- 
cgit v1.3


From 2147dbfd059eb7fefcfd5934f74f25f0693d4a1f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 4 Nov 2013 10:15:08 +0000
Subject: GFS2: Use generic list_lru for quota

By using the generic list_lru code, we can now separate the
per sb quota list locking from the lru locking. The lru
lock is made into the inner-most lock.

As a result of this new lock order, we may occasionally see
items on the per-sb quota list which are "dead" so that the
two places where we traverse that list are updated to take
account of that.

As a result of this patch, the gfs2 quota shrinker is now
NUMA zone aware, and we are also laying the foundations for
further improvments in due course.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Abhijith Das <adas@redhat.com>
Tested-by: Abhijith Das <adas@redhat.com>
Cc: Dave Chinner <dchinner@redhat.com>
---
 fs/gfs2/incore.h |   5 +--
 fs/gfs2/main.c   |  19 ++++-----
 fs/gfs2/quota.c  | 118 ++++++++++++++++++++++++++++++++-----------------------
 fs/gfs2/quota.h  |   9 ++---
 4 files changed, 85 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 9d778044cc6e..ba1ea67f4eeb 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -420,11 +420,10 @@ enum {
 
 struct gfs2_quota_data {
 	struct list_head qd_list;
-	struct list_head qd_reclaim;
-
+	struct kqid qd_id;
 	struct lockref qd_lockref;
+	struct list_head qd_lru;
 
-	struct kqid qd_id;
 	unsigned long qd_flags;		/* QDF_... */
 
 	s64 qd_change;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 351586e24e30..0650db2541ef 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -31,12 +31,6 @@
 
 struct workqueue_struct *gfs2_control_wq;
 
-static struct shrinker qd_shrinker = {
-	.count_objects = gfs2_qd_shrink_count,
-	.scan_objects = gfs2_qd_shrink_scan,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static void gfs2_init_inode_once(void *foo)
 {
 	struct gfs2_inode *ip = foo;
@@ -87,6 +81,10 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		return error;
 
+	error = list_lru_init(&gfs2_qd_lru);
+	if (error)
+		goto fail_lru;
+
 	error = gfs2_glock_init();
 	if (error)
 		goto fail;
@@ -139,7 +137,7 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_rsrv_cachep)
 		goto fail;
 
-	register_shrinker(&qd_shrinker);
+	register_shrinker(&gfs2_qd_shrinker);
 
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
@@ -179,7 +177,9 @@ fail_wq:
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
 fail:
-	unregister_shrinker(&qd_shrinker);
+	list_lru_destroy(&gfs2_qd_lru);
+fail_lru:
+	unregister_shrinker(&gfs2_qd_shrinker);
 	gfs2_glock_exit();
 
 	if (gfs2_rsrv_cachep)
@@ -214,13 +214,14 @@ fail:
 
 static void __exit exit_gfs2_fs(void)
 {
-	unregister_shrinker(&qd_shrinker);
+	unregister_shrinker(&gfs2_qd_shrinker);
 	gfs2_glock_exit();
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 	destroy_workqueue(gfs_recovery_wq);
 	destroy_workqueue(gfs2_control_wq);
+	list_lru_destroy(&gfs2_qd_lru);
 
 	rcu_barrier();
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 466516ac5e57..453b50eaddec 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -51,6 +51,7 @@
 #include <linux/quota.h>
 #include <linux/dqblk_xfs.h>
 #include <linux/lockref.h>
+#include <linux/list_lru.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -72,29 +73,25 @@ struct gfs2_quota_change_host {
 	struct kqid qc_id;
 };
 
-static LIST_HEAD(qd_lru_list);
-static atomic_t qd_lru_count = ATOMIC_INIT(0);
+/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */
 static DEFINE_SPINLOCK(qd_lock);
+struct list_lru gfs2_qd_lru;
 
-unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
-				  struct shrink_control *sc)
+static void gfs2_qd_dispose(struct list_head *list)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
-	int nr_to_scan = sc->nr_to_scan;
-	long freed = 0;
 
-	if (!(sc->gfp_mask & __GFP_FS))
-		return SHRINK_STOP;
-
-	spin_lock(&qd_lock);
-	while (nr_to_scan && !list_empty(&qd_lru_list)) {
-		qd = list_entry(qd_lru_list.next,
-				struct gfs2_quota_data, qd_reclaim);
+	while (!list_empty(list)) {
+		qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
 		sdp = qd->qd_gl->gl_sbd;
 
+		list_del(&qd->qd_lru);
+
 		/* Free from the filesystem-specific list */
+		spin_lock(&qd_lock);
 		list_del(&qd->qd_list);
+		spin_unlock(&qd_lock);
 
 		gfs2_assert_warn(sdp, !qd->qd_change);
 		gfs2_assert_warn(sdp, !qd->qd_slot_count);
@@ -104,24 +101,59 @@ unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 		atomic_dec(&sdp->sd_quota_count);
 
 		/* Delete it from the common reclaim list */
-		list_del_init(&qd->qd_reclaim);
-		atomic_dec(&qd_lru_count);
-		spin_unlock(&qd_lock);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
-		spin_lock(&qd_lock);
-		nr_to_scan--;
-		freed++;
 	}
-	spin_unlock(&qd_lock);
+}
+
+
+static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+{
+	struct list_head *dispose = arg;
+	struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
+
+	if (!spin_trylock(&qd->qd_lockref.lock))
+		return LRU_SKIP;
+
+	if (qd->qd_lockref.count == 0) {
+		lockref_mark_dead(&qd->qd_lockref);
+		list_move(&qd->qd_lru, dispose);
+	}
+
+	spin_unlock(&qd->qd_lockref.lock);
+	return LRU_REMOVED;
+}
+
+static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	LIST_HEAD(dispose);
+	unsigned long freed;
+
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+
+	freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
+				   &dispose, &sc->nr_to_scan);
+
+	gfs2_qd_dispose(&dispose);
+
 	return freed;
 }
 
-unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
-				   struct shrink_control *sc)
+static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
 {
-	return vfs_pressure_ratio(atomic_read(&qd_lru_count));
+	return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
 }
 
+struct shrinker gfs2_qd_shrinker = {
+	.count_objects = gfs2_qd_shrink_count,
+	.scan_objects = gfs2_qd_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+
 static u64 qd2index(struct gfs2_quota_data *qd)
 {
 	struct kqid qid = qd->qd_id;
@@ -153,7 +185,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
 	spin_lock_init(&qd->qd_lockref.lock);
 	qd->qd_id = qid;
 	qd->qd_slot = -1;
-	INIT_LIST_HEAD(&qd->qd_reclaim);
+	INIT_LIST_HEAD(&qd->qd_lru);
 
 	error = gfs2_glock_get(sdp, qd2index(qd),
 			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
@@ -181,13 +213,9 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 		found = 0;
 		spin_lock(&qd_lock);
 		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
-			if (qid_eq(qd->qd_id, qid)) {
-				lockref_get(&qd->qd_lockref);
-				if (!list_empty(&qd->qd_reclaim)) {
-					/* Remove it from reclaim list */
-					list_del_init(&qd->qd_reclaim);
-					atomic_dec(&qd_lru_count);
-				}
+			if (qid_eq(qd->qd_id, qid) &&
+			    lockref_get_not_dead(&qd->qd_lockref)) {
+				list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
 				found = 1;
 				break;
 			}
@@ -229,18 +257,13 @@ static void qd_hold(struct gfs2_quota_data *qd)
 
 static void qd_put(struct gfs2_quota_data *qd)
 {
-	spin_lock(&qd_lock);
-
-	if (!lockref_put_or_lock(&qd->qd_lockref)) {
+	if (lockref_put_or_lock(&qd->qd_lockref))
+		return;
 
-		/* Add to the reclaim list */
-		list_add_tail(&qd->qd_reclaim, &qd_lru_list);
-		atomic_inc(&qd_lru_count);
+	qd->qd_lockref.count = 0;
+	list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
+	spin_unlock(&qd->qd_lockref.lock);
 
-		spin_unlock(&qd->qd_lockref.lock);
-	}
-
-	spin_unlock(&qd_lock);
 }
 
 static int slot_get(struct gfs2_quota_data *qd)
@@ -398,11 +421,11 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
 	    (sync_gen && (qd->qd_sync_gen >= *sync_gen)))
 		return 0;
 
-	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+	if (!lockref_get_not_dead(&qd->qd_lockref))
+		return 0;
 
+	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
 	set_bit(QDF_LOCKED, &qd->qd_flags);
-	gfs2_assert_warn(sdp, !__lockref_is_dead(&qd->qd_lockref));
-	lockref_get(&qd->qd_lockref);
 	qd->qd_change_sync = qd->qd_change;
 	gfs2_assert_warn(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
@@ -1329,10 +1352,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 
 		list_del(&qd->qd_list);
 		/* Also remove if this qd exists in the reclaim list */
-		if (!list_empty(&qd->qd_reclaim)) {
-			list_del_init(&qd->qd_reclaim);
-			atomic_dec(&qd_lru_count);
-		}
+		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
 		atomic_dec(&sdp->sd_quota_count);
 		spin_unlock(&qd_lock);
 
@@ -1487,7 +1507,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 	}
 	fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
 	fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */
-	fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+	fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru);
 	return 0;
 }
 
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0f64d9deb1b0..96e4f34a03b0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -10,9 +10,10 @@
 #ifndef __QUOTA_DOT_H__
 #define __QUOTA_DOT_H__
 
+#include <linux/list_lru.h>
+
 struct gfs2_inode;
 struct gfs2_sbd;
-struct shrink_control;
 
 #define NO_UID_QUOTA_CHANGE INVALID_UID
 #define NO_GID_QUOTA_CHANGE INVALID_GID
@@ -53,10 +54,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
-					  struct shrink_control *sc);
-extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
-					 struct shrink_control *sc);
 extern const struct quotactl_ops gfs2_quotactl_ops;
+extern struct shrinker gfs2_qd_shrinker;
+extern struct list_lru gfs2_qd_lru;
 
 #endif /* __QUOTA_DOT_H__ */
-- 
cgit v1.3


From 18da65e7d30a76ab3ca0ccd2b7ca0690f80f28e4 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 1 Nov 2013 13:21:54 +0300
Subject: quota: info leak in quota_getquota()

The if_dqblk struct has a 4 byte hole at the end of the struct so
uninitialized stack information is leaked to user space.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/quota.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index dea86e8967ee..2b363e23f36e 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -117,6 +117,7 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
 
 static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
 {
+	memset(dst, 0, sizeof(*dst));
 	dst->dqb_bhardlimit = src->d_blk_hardlimit;
 	dst->dqb_bsoftlimit = src->d_blk_softlimit;
 	dst->dqb_curspace = src->d_bcount;
-- 
cgit v1.3


From da0169b3b9a460ecbb571dad6123fe3735b148d1 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 4 Nov 2013 09:58:26 -0500
Subject: ext4: remove unreachable code in ext4_can_extents_be_merged()

Commit ec22ba8e ("ext4: disable merging of uninitialized extents")
ensured that if either extent under consideration is uninit, we
decline to merge, and immediately return.

But right after that test, we test again for an uninit
extent; we can never hit this.  So just remove the impossible
test and associated variable.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
---
 fs/ext4/extents.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54d52afcdb19..de6d4674ba6e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1666,7 +1666,7 @@ int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 				struct ext4_extent *ex2)
 {
-	unsigned short ext1_ee_len, ext2_ee_len, max_len;
+	unsigned short ext1_ee_len, ext2_ee_len;
 
 	/*
 	 * Make sure that both extents are initialized. We don't merge
@@ -1677,11 +1677,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
 		return 0;
 
-	if (ext4_ext_is_uninitialized(ex1))
-		max_len = EXT_UNINIT_MAX_LEN;
-	else
-		max_len = EXT_INIT_MAX_LEN;
-
 	ext1_ee_len = ext4_ext_get_actual_len(ex1);
 	ext2_ee_len = ext4_ext_get_actual_len(ex2);
 
@@ -1694,7 +1689,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	 * as an RO_COMPAT feature, refuse to merge to extents if
 	 * this can result in the top bit of ee_len being set.
 	 */
-	if (ext1_ee_len + ext2_ee_len > max_len)
+	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
 		return 0;
 #ifdef AGGRESSIVE_TEST
 	if (ext1_ee_len >= 4)
-- 
cgit v1.3


From 273203699f82667296e1f14344c5a5a6c4600470 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 29 Oct 2013 22:11:44 +1100
Subject: xfs: xfs_remove deadlocks due to inverted AGF vs AGI lock ordering

Removing an inode from the namespace involves removing the directory
entry and dropping the link count on the inode. Removing the
directory entry can result in locking an AGF (directory blocks were
freed) and removing a link count can result in placing the inode on
an unlinked list which results in locking an AGI.

The big problem here is that we have an ordering constraint on AGF
and AGI locking - inode allocation locks the AGI, then can allocate
a new extent for new inodes, locking the AGF after the AGI.
Similarly, freeing the inode removes the inode from the unlinked
list, requiring that we lock the AGI first, and then freeing the
inode can result in an inode chunk being freed and hence freeing
disk space requiring that we lock an AGF.

Hence the ordering that is imposed by other parts of the code is AGI
before AGF. This means we cannot remove the directory entry before
we drop the inode reference count and put it on the unlinked list as
this results in a lock order of AGF then AGI, and this can deadlock
against inode allocation and freeing. Therefore we must drop the
link counts before we remove the directory entry.

This is still safe from a transactional point of view - it is not
until we get to xfs_bmap_finish() that we have the possibility of
multiple transactions in this operation. Hence as long as we remove
the directory entry and drop the link count in the first transaction
of the remove operation, there are no transactional constraints on
the ordering here.

Change the ordering of the operations in the xfs_remove() function
to align the ordering of AGI and AGF locking to match that of the
rest of the code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c | 72 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 326b94dbe159..001aa893ed59 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2404,6 +2404,33 @@ xfs_iunpin_wait(
 		__xfs_iunpin_wait(ip);
 }
 
+/*
+ * Removing an inode from the namespace involves removing the directory entry
+ * and dropping the link count on the inode. Removing the directory entry can
+ * result in locking an AGF (directory blocks were freed) and removing a link
+ * count can result in placing the inode on an unlinked list which results in
+ * locking an AGI.
+ *
+ * The big problem here is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent for
+ * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
+ * removes the inode from the unlinked list, requiring that we lock the AGI
+ * first, and then freeing the inode can result in an inode chunk being freed
+ * and hence freeing disk space requiring that we lock an AGF.
+ *
+ * Hence the ordering that is imposed by other parts of the code is AGI before
+ * AGF. This means we cannot remove the directory entry before we drop the inode
+ * reference count and put it on the unlinked list as this results in a lock
+ * order of AGF then AGI, and this can deadlock against inode allocation and
+ * freeing. Therefore we must drop the link counts before we remove the
+ * directory entry.
+ *
+ * This is still safe from a transactional point of view - it is not until we
+ * get to xfs_bmap_finish() that we have the possibility of multiple
+ * transactions in this operation. Hence as long as we remove the directory
+ * entry and drop the link count in the first transaction of the remove
+ * operation, there are no transactional constraints on the ordering here.
+ */
 int
 xfs_remove(
 	xfs_inode_t             *dp,
@@ -2473,6 +2500,7 @@ xfs_remove(
 	/*
 	 * If we're removing a directory perform some additional validation.
 	 */
+	cancel_flags |= XFS_TRANS_ABORT;
 	if (is_dir) {
 		ASSERT(ip->i_d.di_nlink >= 2);
 		if (ip->i_d.di_nlink != 2) {
@@ -2483,31 +2511,16 @@ xfs_remove(
 			error = XFS_ERROR(ENOTEMPTY);
 			goto out_trans_cancel;
 		}
-	}
 
-	xfs_bmap_init(&free_list, &first_block);
-	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
-					&first_block, &free_list, resblks);
-	if (error) {
-		ASSERT(error != ENOENT);
-		goto out_bmap_cancel;
-	}
-	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	if (is_dir) {
-		/*
-		 * Drop the link from ip's "..".
-		 */
+		/* Drop the link from ip's "..".  */
 		error = xfs_droplink(tp, dp);
 		if (error)
-			goto out_bmap_cancel;
+			goto out_trans_cancel;
 
-		/*
-		 * Drop the "." link from ip to self.
-		 */
+		/* Drop the "." link from ip to self.  */
 		error = xfs_droplink(tp, ip);
 		if (error)
-			goto out_bmap_cancel;
+			goto out_trans_cancel;
 	} else {
 		/*
 		 * When removing a non-directory we need to log the parent
@@ -2516,20 +2529,24 @@ xfs_remove(
 		 */
 		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 	}
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
-	/*
-	 * Drop the link from dp to ip.
-	 */
+	/* Drop the link from dp to ip. */
 	error = xfs_droplink(tp, ip);
 	if (error)
-		goto out_bmap_cancel;
+		goto out_trans_cancel;
 
-	/*
-	 * Determine if this is the last link while
-	 * we are in the transaction.
-	 */
+	/* Determine if this is the last link while the inode is locked */
 	link_zero = (ip->i_d.di_nlink == 0);
 
+	xfs_bmap_init(&free_list, &first_block);
+	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+					&first_block, &free_list, resblks);
+	if (error) {
+		ASSERT(error != ENOENT);
+		goto out_bmap_cancel;
+	}
+
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * remove transaction goes to disk before returning to
@@ -2559,7 +2576,6 @@ xfs_remove(
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
-	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
  std_return:
-- 
cgit v1.3


From 3da580aab9482f5bf88ea79a4bf0a511eea6fa44 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 2 Nov 2013 06:57:18 -0400
Subject: nfs: set security label when revalidating inode

Currently, we fetch the security label when revalidating an inode's
attributes, but don't apply it. This is in contrast to the readdir()
codepath where we do apply label changes.

Cc: Dave Quigley <dpquigl@davequigley.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 471ba59c42f9..26e77d81234e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -923,6 +923,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
+	nfs_setsecurity(inode, fattr, label);
+
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode));
-- 
cgit v1.3


From d204c5d2b8f7614350cd4609f4d4cdcf25494d74 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Nov 2013 13:23:59 -0500
Subject: NFSv4.2: encode_readdir - only ask for labels when doing readdirplus

Currently, if the server is doing NFSv4.2 and supports labeled NFS, then
our on-the-wire READDIR request ends up asking for the label information,
which is then ignored unless we're doing readdirplus.
This patch ensures that READDIR doesn't ask the server for label information
at all unless the readdir->bitmask contains the FATTR4_WORD2_SECURITY_LABEL
attribute, and the readdir->plus flag is set.

While we're at it, optimise away the 3rd bitmap field if it is zero.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f903389d90f1..5be2868c02f1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int);
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 /* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
 #define	nfs4_label_maxsz	(4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
-#define encode_readdir_space 24
-#define encode_readdir_bitmask_sz 3
 #else
 #define	nfs4_label_maxsz	0
-#define encode_readdir_space 20
-#define encode_readdir_bitmask_sz 2
 #endif
 /* We support only one layout type per file system */
 #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
@@ -1581,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	};
 	uint32_t dircount = readdir->count >> 1;
 	__be32 *p, verf[2];
+	uint32_t attrlen = 0;
+	unsigned int i;
 
 	if (readdir->plus) {
 		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
@@ -1589,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
 			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
 			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
 		dircount >>= 1;
 	}
 	/* Use mounted_on_fileid only if the server supports it */
 	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
 		attrs[0] |= FATTR4_WORD0_FILEID;
+	for (i = 0; i < ARRAY_SIZE(attrs); i++) {
+		attrs[i] &= readdir->bitmask[i];
+		if (attrs[i] != 0)
+			attrlen = i+1;
+	}
 
 	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
 	encode_uint64(xdr, readdir->cookie);
 	encode_nfs4_verifier(xdr, &readdir->verifier);
-	p = reserve_space(xdr, encode_readdir_space);
+	p = reserve_space(xdr, 12 + (attrlen << 2));
 	*p++ = cpu_to_be32(dircount);
 	*p++ = cpu_to_be32(readdir->count);
-	*p++ = cpu_to_be32(encode_readdir_bitmask_sz);
-	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-	*p   = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
-	if (encode_readdir_bitmask_sz > 2) {
-		if (hdr->minorversion > 1)
-			attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
-		p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
-	}
+	*p++ = cpu_to_be32(attrlen);
+	for (i = 0; i < attrlen; i++)
+		*p++ = cpu_to_be32(attrs[i]);
 	memcpy(verf, readdir->verifier.data, sizeof(verf));
 
 	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
-- 
cgit v1.3


From b944dba31d2c23f1cf5d69a66cc3449e0ba78503 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Nov 2013 15:20:20 -0500
Subject: NFSv4: Sanity check the server reply in _nfs4_server_capabilities

We don't want to be setting capabilities and/or requesting attributes
that are not appropriate for the NFSv4 minor version.

- Ensure that we clear the NFS_CAP_SECURITY_LABEL capability when appropriate
- Ensure that we limit the attribute bitmasks to the mounted_on_fileid
  attribute and less for NFSv4.0
- Ensure that we limit the attribute bitmasks to suppattr_exclcreat and
  less for NFSv4.1
- Ensure that we limit it to change_sec_label or less for NFSv4.2

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c    | 25 ++++++++++++++++++++-----
 include/linux/nfs4.h |  2 ++
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7e28b5c77cbc..ed2a0e6b9aed 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2706,6 +2706,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 		nfs4_close_state(ctx->state, ctx->mode);
 }
 
+#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
+#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL)
+
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 	struct nfs4_server_caps_arg args = {
@@ -2721,12 +2725,25 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (status == 0) {
+		/* Sanity check the server answers */
+		switch (server->nfs_client->cl_minorversion) {
+		case 0:
+			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
+			res.attr_bitmask[2] = 0;
+			break;
+		case 1:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+			break;
+		case 2:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+		}
 		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
 		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
 				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
 				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
-				NFS_CAP_CTIME|NFS_CAP_MTIME);
+				NFS_CAP_CTIME|NFS_CAP_MTIME|
+				NFS_CAP_SECURITY_LABEL);
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
 			server->caps |= NFS_CAP_ACLS;
 		if (res.has_links != 0)
@@ -2755,14 +2772,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 #endif
 		memcpy(server->attr_bitmask_nl, res.attr_bitmask,
 				sizeof(server->attr_bitmask));
+		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
 
-		if (server->caps & NFS_CAP_SECURITY_LABEL) {
-			server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-			res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-		}
 		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		server->cache_consistency_bitmask[2] = 0;
 		server->acl_bitmask = res.acl_bitmask;
 		server->fh_expire_type = res.fh_expire_type;
 	}
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index bfe6c379a24e..c6f41b616965 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -396,6 +396,8 @@ enum lock_type4 {
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
 #define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
+#define FATTR4_WORD2_CHANGE_SECURITY_LABEL \
+					(1UL << 17)
 
 /* MDS threshold bitmap bits */
 #define THRESHOLD_RD                    (1UL << 0)
-- 
cgit v1.3


From fab99ebe39fe7d11fbd9b5fb84f07432af9ba36f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 4 Nov 2013 13:46:40 -0500
Subject: NFSv4.2: Remove redundant checks in
 nfs_setsecurity+nfs4_label_init_security

We already check for nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)
in nfs4_label_alloc()
We check the minor version in _nfs4_server_capabilities before setting
NFS_CAP_SECURITY_LABEL.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c    | 6 ------
 fs/nfs/nfs4proc.c | 3 ---
 2 files changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 26e77d81234e..18ab2da4eeb6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -274,12 +274,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 	if (label == NULL)
 		return;
 
-	if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
-		return;
-
-	if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
-		return;
-
 	if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
 		error = security_inode_notifysecctx(inode, label->label,
 				label->len);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ed2a0e6b9aed..5ab33c0792df 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
 	if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
 		return NULL;
 
-	if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
-		return NULL;
-
 	err = security_dentry_init_security(dentry, sattr->ia_mode,
 				&dentry->d_name, (void **)&label->label, &label->len);
 	if (err == 0)
-- 
cgit v1.3


From b78800baee494eaca1d8a9ca2d490704fcc49fa0 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 4 Nov 2013 17:46:04 -0500
Subject: Revert "nfsd: remove_stid can be incorporated into
 nfs4_put_delegation"

This reverts commit 7ebe40f20372688a627ad6c754bc0d1c05df58a9.  We forgot
the nfs4_put_delegation call in fs/nfsd/nfs4callback.c which should not
be unhashing the stateid.  This lead to warnings from the idr code when
we tried to removed id's twice.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e03e8efff53f..0cceb99705c9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -410,7 +410,6 @@ static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s)
 void
 nfs4_put_delegation(struct nfs4_delegation *dp)
 {
-	remove_stid(&dp->dl_stid);
 	if (atomic_dec_and_test(&dp->dl_count)) {
 		nfs4_free_stid(deleg_slab, &dp->dl_stid);
 		num_delegations--;
@@ -451,12 +450,14 @@ unhash_delegation(struct nfs4_delegation *dp)
 static void destroy_revoked_delegation(struct nfs4_delegation *dp)
 {
 	list_del_init(&dp->dl_recall_lru);
+	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 }
 
 static void destroy_delegation(struct nfs4_delegation *dp)
 {
 	unhash_delegation(dp);
+	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 }
 
@@ -3161,6 +3162,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
 	open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
 	return;
 out_free:
+	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 out_no_deleg:
 	open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
-- 
cgit v1.3


From f6011081f5e290756bd90fe96f1e86d3eac76f77 Mon Sep 17 00:00:00 2001
From: Maxim Patlasov <MPatlasov@parallels.com>
Date: Wed, 2 Oct 2013 15:01:07 +0400
Subject: fuse: writepages: roll back changes if request not found

fuse_writepage_in_flight() returns false if it fails to find request with
given index in fi->writepages.  Then the caller proceeds with populating
data->orig_pages[] and incrementing req->num_pages.  Hence,
fuse_writepage_in_flight() must revert changes it made in request before
returning false.

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 135360e2b9e5..077b03844998 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1649,7 +1649,6 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 
 	spin_lock(&fc->lock);
 	list_del(&new_req->writepages_entry);
-	new_req->num_pages = 1;
 	list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
 		BUG_ON(old_req->inode != new_req->inode);
 		curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
@@ -1659,9 +1658,12 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 			break;
 		}
 	}
-	if (!found)
+	if (!found) {
+		list_add(&new_req->writepages_entry, &fi->writepages);
 		goto out_unlock;
+	}
 
+	new_req->num_pages = 1;
 	for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
 		BUG_ON(tmp->inode != new_req->inode);
 		curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
-- 
cgit v1.3


From 6eaf4782eb09e28dbd13d23b9ce0fb7646daf37e Mon Sep 17 00:00:00 2001
From: Maxim Patlasov <MPatlasov@parallels.com>
Date: Wed, 2 Oct 2013 21:38:32 +0400
Subject: fuse: writepages: crop secondary requests

If writeback happens while fuse is in FUSE_NOWRITE condition, the request
will be queued but not processed immediately (see fuse_flush_writepages()).
Until FUSE_NOWRITE becomes relaxed, more writebacks can happen.  They will
be queued as "secondary" requests to that first ("primary") request.

Existing implementation crops only primary request.  This is not correct
because a subsequent extending write(2) may increase i_size and then
secondary requests won't be cropped properly.  The result would be stale
data written to the server to a file offset where zeros must be.

Similar problem may happen if secondary requests are attached to an
in-flight request that was already cropped.

The patch solves the issue by cropping all secondary requests in
fuse_writepage_end().  Thanks to Miklos for idea.

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 077b03844998..1cb303eba4d0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1436,12 +1436,12 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 /* Called under fc->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
+				loff_t size)
 __releases(fc->lock)
 __acquires(fc->lock)
 {
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
-	loff_t size = i_size_read(req->inode);
 	struct fuse_write_in *inarg = &req->misc.write.in;
 	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
 
@@ -1482,12 +1482,13 @@ __acquires(fc->lock)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	size_t crop = i_size_read(inode);
 	struct fuse_req *req;
 
 	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
 		req = list_entry(fi->queued_writes.next, struct fuse_req, list);
 		list_del_init(&req->list);
-		fuse_send_writepage(fc, req);
+		fuse_send_writepage(fc, req, crop);
 	}
 }
 
@@ -1499,12 +1500,37 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 	mapping_set_error(inode->i_mapping, req->out.h.error);
 	spin_lock(&fc->lock);
 	while (req->misc.write.next) {
+		struct fuse_conn *fc = get_fuse_conn(inode);
+		struct fuse_write_in *inarg = &req->misc.write.in;
 		struct fuse_req *next = req->misc.write.next;
 		req->misc.write.next = next->misc.write.next;
 		next->misc.write.next = NULL;
 		list_add(&next->writepages_entry, &fi->writepages);
-		list_add_tail(&next->list, &fi->queued_writes);
-		fuse_flush_writepages(inode);
+
+		/*
+		 * Skip fuse_flush_writepages() to make it easy to crop requests
+		 * based on primary request size.
+		 *
+		 * 1st case (trivial): there are no concurrent activities using
+		 * fuse_set/release_nowrite.  Then we're on safe side because
+		 * fuse_flush_writepages() would call fuse_send_writepage()
+		 * anyway.
+		 *
+		 * 2nd case: someone called fuse_set_nowrite and it is waiting
+		 * now for completion of all in-flight requests.  This happens
+		 * rarely and no more than once per page, so this should be
+		 * okay.
+		 *
+		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
+		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
+		 * that fuse_set_nowrite returned implies that all in-flight
+		 * requests were completed along with all of their secondary
+		 * requests.  Further primary requests are blocked by negative
+		 * writectr.  Hence there cannot be any in-flight requests and
+		 * no invocations of fuse_writepage_end() while we're in
+		 * fuse_set_nowrite..fuse_release_nowrite section.
+		 */
+		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
 	}
 	fi->writectr--;
 	fuse_writepage_finish(fc, req);
-- 
cgit v1.3


From 41b6e41fc609753a9386d24295f7ed03b28c4601 Mon Sep 17 00:00:00 2001
From: Maxim Patlasov <MPatlasov@parallels.com>
Date: Wed, 2 Oct 2013 21:38:43 +0400
Subject: fuse: writepages: update bdi writeout when deleting secondary request

BDI_WRITTEN counter is used to estimate bdi bandwidth.  It must be
incremented every time as bdi ends page writeback.  No matter whether it
was fulfilled by actual write or by discarding the request (e.g. due to
shrunk i_size).

Note that even before writepages patches, the case "Got truncated off
completely" was handled in fuse_send_writepage() by calling
fuse_writepage_finish() which updated BDI_WRITTEN unconditionally.

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cb303eba4d0..e6fdd59a7906 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1701,11 +1701,14 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 
 	if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
 					old_req->state == FUSE_REQ_PENDING)) {
+		struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+
 		copy_highpage(old_req->pages[0], page);
 		spin_unlock(&fc->lock);
 
-		dec_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+		dec_bdi_stat(bdi, BDI_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+		bdi_writeout_inc(bdi);
 		fuse_writepage_free(fc, new_req);
 		fuse_request_free(new_req);
 		goto out;
-- 
cgit v1.3


From ce128de6260f86a990ed44a697f26d0859684f28 Mon Sep 17 00:00:00 2001
From: Maxim Patlasov <MPatlasov@parallels.com>
Date: Wed, 2 Oct 2013 21:38:54 +0400
Subject: fuse: writepages: protect secondary requests from fuse file release

All async fuse requests must be supplied with extra reference to a fuse
file.  This is necessary to ensure that the fuse file is not released until
all in-flight requests are completed.  Fuse secondary writeback requests
must obey this rule as well.

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e6fdd59a7906..7e70506297bc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1505,6 +1505,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 		struct fuse_req *next = req->misc.write.next;
 		req->misc.write.next = next->misc.write.next;
 		next->misc.write.next = NULL;
+		next->ff = fuse_file_get(req->ff);
 		list_add(&next->writepages_entry, &fi->writepages);
 
 		/*
-- 
cgit v1.3


From 7ba3ec5749ddb61f79f7be17b5fd7720eebc52de Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 5 Nov 2013 01:15:38 +0100
Subject: ext2: Fix fs corruption in ext2_get_xip_mem()

Commit 8e3dffc651cb "Ext2: mark inode dirty after the function
dquot_free_block_nodirty is called" unveiled a bug in __ext2_get_block()
called from ext2_get_xip_mem(). That function called ext2_get_block()
mistakenly asking it to map 0 blocks while 1 was intended. Before the
above mentioned commit things worked out fine by luck but after that commit
we started returning that we allocated 0 blocks while we in fact
allocated 1 block and thus allocation was looping until all blocks in
the filesystem were exhausted.

Fix the problem by properly asking for one block and also add assertion
in ext2_get_blocks() to catch similar problems.

Reported-and-tested-by: Andiry Xu <andiry.xu@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c | 2 ++
 fs/ext2/xip.c   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c260de6d7b6d..8a337640a46a 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -632,6 +632,8 @@ static int ext2_get_blocks(struct inode *inode,
 	int count = 0;
 	ext2_fsblk_t first_block = 0;
 
+	BUG_ON(maxblocks == 0);
+
 	depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
 
 	if (depth == 0)
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index 1c3312858fcf..e98171a11cfe 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -35,6 +35,7 @@ __ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
 	int rc;
 
 	memset(&tmp, 0, sizeof(struct buffer_head));
+	tmp.b_size = 1 << inode->i_blkbits;
 	rc = ext2_get_block(inode, pgoff, &tmp, create);
 	*result = tmp.b_blocknr;
 
-- 
cgit v1.3


From 81407c84ace88368ff23abb81caaeacf050c8450 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 24 May 2013 09:49:14 -0400
Subject: audit: allow unsetting the loginuid (with priv)

If a task has CAP_AUDIT_CONTROL allow that task to unset their loginuid.
This would allow a child of that task to set their loginuid without
CAP_AUDIT_CONTROL.  Thus when launching a new login daemon, a
priviledged helper would be able to unset the loginuid and then the
daemon, which may be malicious user facing, do not need priv to function
correctly.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/proc/base.c   | 14 ++++++++++----
 kernel/auditsc.c |  4 +++-
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1485e38daaa3..03c8d747be48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,10 +1151,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
-	if (!uid_valid(kloginuid)) {
-		length = -EINVAL;
-		goto out_free_page;
+
+	/* is userspace tring to explicitly UNSET the loginuid? */
+	if (loginuid == AUDIT_UID_UNSET) {
+		kloginuid = INVALID_UID;
+	} else {
+		kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
+		if (!uid_valid(kloginuid)) {
+			length = -EINVAL;
+			goto out_free_page;
+		}
 	}
 
 	length = audit_set_loginuid(kloginuid);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b55788bf1607..c75d7813aef2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2019,7 +2019,9 @@ int audit_set_loginuid(kuid_t loginuid)
 	if (rc)
 		goto out;
 
-	sessionid = atomic_inc_return(&session_id);
+	/* are we setting or clearing? */
+	if (uid_valid(loginuid))
+		sessionid = atomic_inc_return(&session_id);
 
 	task->sessionid = sessionid;
 	task->loginuid = loginuid;
-- 
cgit v1.3


From 14e972b4517128ac8e30e3de2ee4fbd995084223 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 8 May 2013 10:25:58 -0400
Subject: audit: add child record before the create to handle case where create
 fails

Historically, when a syscall that creates a dentry fails, you get an audit
record that looks something like this (when trying to create a file named
"new" in "/tmp/tmp.SxiLnCcv63"):

    type=PATH msg=audit(1366128956.279:965): item=0 name="/tmp/tmp.SxiLnCcv63/new" inode=2138308 dev=fd:02 mode=040700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmp_t:s15:c0.c1023

This record makes no sense since it's associating the inode information for
"/tmp/tmp.SxiLnCcv63" with the path "/tmp/tmp.SxiLnCcv63/new". The recent
patch I posted to fix the audit_inode call in do_last fixes this, by making it
look more like this:

    type=PATH msg=audit(1366128765.989:13875): item=0 name="/tmp/tmp.DJ1O8V3e4f/" inode=141 dev=fd:02 mode=040700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmp_t:s15:c0.c1023

While this is more correct, if the creation of the file fails, then we
have no record of the filename that the user tried to create.

This patch adds a call to audit_inode_child to may_create. This creates
an AUDIT_TYPE_CHILD_CREATE record that will sit in place until the
create succeeds. When and if the create does succeed, then this record
will be updated with the correct inode info from the create.

This fixes what was broken in commit bfcec708.
Commit 79f6530c should also be backported to stable v3.7+.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/namei.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 7720fbd5277b..df9946e83db4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2262,6 +2262,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
  */
 static inline int may_create(struct inode *dir, struct dentry *child)
 {
+	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
 	if (child->d_inode)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
-- 
cgit v1.3


From 9410d228a4cf434305306746bb799fb7acdd8648 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 30 Oct 2013 18:05:24 -0400
Subject: audit: call audit_bprm() only once to add AUDIT_EXECVE information

Move the audit_bprm() call from search_binary_handler() to exec_binprm().  This
allows us to get rid of the mm member of struct audit_aux_data_execve since
bprm->mm will equal current->mm.

This also mitigates the issue that ->argc could be modified by the
load_binary() call in search_binary_handler().

audit_bprm() was being called to add an AUDIT_EXECVE record to the audit
context every time search_binary_handler() was recursively called.  Only one
reference is necessary.

Reported-by: Oleg Nesterov <onestero@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
This patch is against 3.11, but was developed on Oleg's post-3.11 patches that
introduce exec_binprm().
---
 fs/exec.c             | 5 +----
 include/linux/audit.h | 9 +++------
 kernel/audit.h        | 1 -
 kernel/auditsc.c      | 4 ----
 4 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index fd774c7cb483..c5c24f2fc44a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1383,10 +1383,6 @@ int search_binary_handler(struct linux_binprm *bprm)
 	if (retval)
 		return retval;
 
-	retval = audit_bprm(bprm);
-	if (retval)
-		return retval;
-
 	/* Need to fetch pid before load_binary changes it */
 	old_pid = current->pid;
 	rcu_read_lock();
@@ -1408,6 +1404,7 @@ int search_binary_handler(struct linux_binprm *bprm)
 			bprm->recursion_depth = depth;
 			if (retval >= 0) {
 				if (depth == 0) {
+					audit_bprm(bprm);
 					trace_sched_process_exec(current, old_pid, bprm);
 					ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
 				}
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 08b38bf13eb9..a40641954c29 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -238,11 +238,10 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid
 	if (unlikely(!audit_dummy_context()))
 		__audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
-static inline int audit_bprm(struct linux_binprm *bprm)
+static inline void audit_bprm(struct linux_binprm *bprm)
 {
 	if (unlikely(!audit_dummy_context()))
 		__audit_bprm(bprm);
-	return 0;
 }
 static inline int audit_socketcall(int nargs, unsigned long *args)
 {
@@ -369,10 +368,8 @@ static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
 					gid_t gid, umode_t mode)
 { }
-static inline int audit_bprm(struct linux_binprm *bprm)
-{
-	return 0;
-}
+static inline void audit_bprm(struct linux_binprm *bprm)
+{ }
 static inline int audit_socketcall(int nargs, unsigned long *args)
 {
 	return 0;
diff --git a/kernel/audit.h b/kernel/audit.h
index e7b94ab66c49..b779642b29af 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -199,7 +199,6 @@ struct audit_context {
 		} mmap;
 		struct {
 			int			argc;
-			struct mm_struct	*mm;
 		} execve;
 	};
 	int fds[2];
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 425a8939be1a..dfc5d6745ee5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1145,9 +1145,6 @@ static void audit_log_execve_info(struct audit_context *context,
 	const char __user *p;
 	char *buf;
 
-	if (context->execve.mm != current->mm)
-		return; /* execve failed, no additional info */
-
 	p = (const char __user *)current->mm->arg_start;
 
 	audit_log_format(*ab, "argc=%d", context->execve.argc);
@@ -2144,7 +2141,6 @@ void __audit_bprm(struct linux_binprm *bprm)
 
 	context->type = AUDIT_EXECVE;
 	context->execve.argc = bprm->argc;
-	context->execve.mm = bprm->mm;
 }
 
 
-- 
cgit v1.3


From 3b03f72445ba1437cfa29f9719bb3cfdb60558d9 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Wed, 6 Nov 2013 09:12:04 +0800
Subject: f2fs: avoid to use a NULL point in destroy_segment_manager

A NULL point should avoid to be used in destroy_segment_manager after allocating
memory fail for f2fs_sm_info.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 3d4d5fc19e6f..ff363e686b7e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1744,6 +1744,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
 void destroy_segment_manager(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_sm_info *sm_info = SM_I(sbi);
+	if (!sm_info)
+		return;
 	destroy_dirty_segmap(sbi);
 	destroy_curseg(sbi);
 	destroy_free_segmap(sbi);
-- 
cgit v1.3


From 1ca7d67cf5d5a2aef26a8d9afd789006fa098347 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Mon, 7 Oct 2013 15:51:59 -0700
Subject: seqcount: Add lockdep functionality to seqcount/seqlock structures

Currently seqlocks and seqcounts don't support lockdep.

After running across a seqcount related deadlock in the timekeeping
code, I used a less-refined and more focused variant of this patch
to narrow down the cause of the issue.

This is a first-pass attempt to properly enable lockdep functionality
on seqlocks and seqcounts.

Since seqcounts are used in the vdso gettimeofday code, I've provided
non-lockdep accessors for those needs.

I've also handled one case where there were nested seqlock writers
and there may be more edge cases.

Comments and feedback would be appreciated!

Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/1381186321-4906-3-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/vdso/vclock_gettime.c |  8 ++---
 fs/dcache.c                    |  4 +--
 fs/fs_struct.c                 |  2 +-
 include/linux/init_task.h      |  8 ++---
 include/linux/lockdep.h        |  8 +++--
 include/linux/seqlock.h        | 79 ++++++++++++++++++++++++++++++++++++++----
 mm/filemap_xip.c               |  2 +-
 7 files changed, 90 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 72074d528400..2ada505067cc 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -178,7 +178,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 
 	ts->tv_nsec = 0;
 	do {
-		seq = read_seqcount_begin(&gtod->seq);
+		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
@@ -198,7 +198,7 @@ notrace static int do_monotonic(struct timespec *ts)
 
 	ts->tv_nsec = 0;
 	do {
-		seq = read_seqcount_begin(&gtod->seq);
+		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
@@ -214,7 +214,7 @@ notrace static int do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
 	do {
-		seq = read_seqcount_begin(&gtod->seq);
+		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
 		ts->tv_sec = gtod->wall_time_coarse.tv_sec;
 		ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -225,7 +225,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
 {
 	unsigned long seq;
 	do {
-		seq = read_seqcount_begin(&gtod->seq);
+		seq = read_seqcount_begin_no_lockdep(&gtod->seq);
 		ts->tv_sec = gtod->monotonic_time_coarse.tv_sec;
 		ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
diff --git a/fs/dcache.c b/fs/dcache.c
index ae6ebb88ceff..f750be22c08c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2574,7 +2574,7 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
 	dentry_lock_for_move(dentry, target);
 
 	write_seqcount_begin(&dentry->d_seq);
-	write_seqcount_begin(&target->d_seq);
+	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
 
 	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
 
@@ -2706,7 +2706,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	dentry_lock_for_move(anon, dentry);
 
 	write_seqcount_begin(&dentry->d_seq);
-	write_seqcount_begin(&anon->d_seq);
+	write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
 
 	dparent = dentry->d_parent;
 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index d8ac61d0c932..7dca743b2ce1 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(current_umask);
 struct fs_struct init_fs = {
 	.users		= 1,
 	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
-	.seq		= SEQCNT_ZERO,
+	.seq		= SEQCNT_ZERO(init_fs.seq),
 	.umask		= 0022,
 };
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5cd0f0949927..b0ed422e4e4a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -32,10 +32,10 @@ extern struct fs_struct init_fs;
 #endif
 
 #ifdef CONFIG_CPUSETS
-#define INIT_CPUSET_SEQ							\
-	.mems_allowed_seq = SEQCNT_ZERO,
+#define INIT_CPUSET_SEQ(tsk)							\
+	.mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
 #else
-#define INIT_CPUSET_SEQ
+#define INIT_CPUSET_SEQ(tsk)
 #endif
 
 #define INIT_SIGNALS(sig) {						\
@@ -220,7 +220,7 @@ extern struct task_group root_task_group;
 	INIT_FTRACE_GRAPH						\
 	INIT_TRACE_RECURSION						\
 	INIT_TASK_RCU_PREEMPT(tsk)					\
-	INIT_CPUSET_SEQ							\
+	INIT_CPUSET_SEQ(tsk)						\
 	INIT_VTIME(tsk)							\
 }
 
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index cfc2f119779a..92b1bfc5da60 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -497,6 +497,10 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 #define rwlock_acquire_read(l, s, t, i)		lock_acquire_shared_recursive(l, s, t, NULL, i)
 #define rwlock_release(l, n, i)			lock_release(l, n, i)
 
+#define seqcount_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
+#define seqcount_acquire_read(l, s, t, i)	lock_acquire_shared_recursive(l, s, t, NULL, i)
+#define seqcount_release(l, n, i)		lock_release(l, n, i)
+
 #define mutex_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define mutex_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
 #define mutex_release(l, n, i)			lock_release(l, n, i)
@@ -504,11 +508,11 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 #define rwsem_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define rwsem_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
 #define rwsem_acquire_read(l, s, t, i)		lock_acquire_shared(l, s, t, NULL, i)
-# define rwsem_release(l, n, i)			lock_release(l, n, i)
+#define rwsem_release(l, n, i)			lock_release(l, n, i)
 
 #define lock_map_acquire(l)			lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
 #define lock_map_acquire_read(l)		lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
-# define lock_map_release(l)			lock_release(l, 1, _THIS_IP_)
+#define lock_map_release(l)			lock_release(l, 1, _THIS_IP_)
 
 #ifdef CONFIG_PROVE_LOCKING
 # define might_lock(lock) 						\
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 21a209336e79..1e8a8b6e837d 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -34,6 +34,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <linux/lockdep.h>
 #include <asm/processor.h>
 
 /*
@@ -44,10 +45,50 @@
  */
 typedef struct seqcount {
 	unsigned sequence;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
 } seqcount_t;
 
-#define SEQCNT_ZERO { 0 }
-#define seqcount_init(x)	do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0)
+static inline void __seqcount_init(seqcount_t *s, const char *name,
+					  struct lock_class_key *key)
+{
+	/*
+	 * Make sure we are not reinitializing a held lock:
+	 */
+	lockdep_init_map(&s->dep_map, name, key, 0);
+	s->sequence = 0;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define SEQCOUNT_DEP_MAP_INIT(lockname) \
+		.dep_map = { .name = #lockname } \
+
+# define seqcount_init(s)				\
+	do {						\
+		static struct lock_class_key __key;	\
+		__seqcount_init((s), #s, &__key);	\
+	} while (0)
+
+static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
+{
+	seqcount_t *l = (seqcount_t *)s;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_);
+	seqcount_release(&l->dep_map, 1, _RET_IP_);
+	local_irq_restore(flags);
+}
+
+#else
+# define SEQCOUNT_DEP_MAP_INIT(lockname)
+# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
+# define seqcount_lockdep_reader_access(x)
+#endif
+
+#define SEQCNT_ZERO(lockname) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(lockname)}
+
 
 /**
  * __read_seqcount_begin - begin a seq-read critical section (without barrier)
@@ -75,6 +116,22 @@ repeat:
 	return ret;
 }
 
+/**
+ * read_seqcount_begin_no_lockdep - start seq-read critical section w/o lockdep
+ * @s: pointer to seqcount_t
+ * Returns: count to be passed to read_seqcount_retry
+ *
+ * read_seqcount_begin_no_lockdep opens a read critical section of the given
+ * seqcount, but without any lockdep checking. Validity of the critical
+ * section is tested by checking read_seqcount_retry function.
+ */
+static inline unsigned read_seqcount_begin_no_lockdep(const seqcount_t *s)
+{
+	unsigned ret = __read_seqcount_begin(s);
+	smp_rmb();
+	return ret;
+}
+
 /**
  * read_seqcount_begin - begin a seq-read critical section
  * @s: pointer to seqcount_t
@@ -86,9 +143,8 @@ repeat:
  */
 static inline unsigned read_seqcount_begin(const seqcount_t *s)
 {
-	unsigned ret = __read_seqcount_begin(s);
-	smp_rmb();
-	return ret;
+	seqcount_lockdep_reader_access(s);
+	return read_seqcount_begin_no_lockdep(s);
 }
 
 /**
@@ -108,6 +164,8 @@ static inline unsigned read_seqcount_begin(const seqcount_t *s)
 static inline unsigned raw_seqcount_begin(const seqcount_t *s)
 {
 	unsigned ret = ACCESS_ONCE(s->sequence);
+
+	seqcount_lockdep_reader_access(s);
 	smp_rmb();
 	return ret & ~1;
 }
@@ -152,14 +210,21 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
  * Sequence counter only version assumes that callers are using their
  * own mutexing.
  */
-static inline void write_seqcount_begin(seqcount_t *s)
+static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
 {
 	s->sequence++;
 	smp_wmb();
+	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
+}
+
+static inline void write_seqcount_begin(seqcount_t *s)
+{
+	write_seqcount_begin_nested(s, 0);
 }
 
 static inline void write_seqcount_end(seqcount_t *s)
 {
+	seqcount_release(&s->dep_map, 1, _RET_IP_);
 	smp_wmb();
 	s->sequence++;
 }
@@ -188,7 +253,7 @@ typedef struct {
  */
 #define __SEQLOCK_UNLOCKED(lockname)			\
 	{						\
-		.seqcount = SEQCNT_ZERO,		\
+		.seqcount = SEQCNT_ZERO(lockname),	\
 		.lock =	__SPIN_LOCK_UNLOCKED(lockname)	\
 	}
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 28fe26b64f8a..d8d9fe3f685c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -26,7 +26,7 @@
  * of ZERO_PAGE(), such as /dev/zero
  */
 static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
 static struct page *__xip_sparse_page;
 
 /* called under xip_sparse_mutex */
-- 
cgit v1.3


From 750b9c90668b173a92b20e747b9736b8537eda5a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 1 Nov 2013 15:27:18 +1100
Subject: xfs: trace AIL manipulations

I debugging a log tail issue on a RHEL6 kernel, I added these trace
points to trace log items being added, moved and removed in the AIL
and how that affected the log tail LSN that was written to the log.
They were very helpful in that they immediately identified the cause
of the problem being seen. Hence I'd like to always have them
available for use.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c       |  1 +
 fs/xfs/xfs_trace.h     | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_trans_ail.c |  3 +++
 3 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 49dd41e6a2dc..8497a00e399d 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1076,6 +1076,7 @@ xlog_assign_tail_lsn_locked(
 		tail_lsn = lip->li_lsn;
 	else
 		tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+	trace_xfs_log_assign_tail_lsn(log, tail_lsn);
 	atomic64_set(&log->l_tail_lsn, tail_lsn);
 	return tail_lsn;
 }
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 47910e638c18..f195476a037c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -31,8 +31,8 @@ struct xfs_da_args;
 struct xfs_da_node_entry;
 struct xfs_dquot;
 struct xfs_log_item;
-struct xlog_ticket;
 struct xlog;
+struct xlog_ticket;
 struct xlog_recover;
 struct xlog_recover_item;
 struct xfs_buf_log_format;
@@ -938,6 +938,63 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
 DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
 DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
 
+DECLARE_EVENT_CLASS(xfs_ail_class,
+	TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
+	TP_ARGS(lip, old_lsn, new_lsn),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(void *, lip)
+		__field(uint, type)
+		__field(uint, flags)
+		__field(xfs_lsn_t, old_lsn)
+		__field(xfs_lsn_t, new_lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = lip->li_mountp->m_super->s_dev;
+		__entry->lip = lip;
+		__entry->type = lip->li_type;
+		__entry->flags = lip->li_flags;
+		__entry->old_lsn = old_lsn;
+		__entry->new_lsn = new_lsn;
+	),
+	TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->lip,
+		  CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+		  CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_AIL_EVENT(name) \
+DEFINE_EVENT(xfs_ail_class, name, \
+	TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \
+	TP_ARGS(lip, old_lsn, new_lsn))
+DEFINE_AIL_EVENT(xfs_ail_insert);
+DEFINE_AIL_EVENT(xfs_ail_move);
+DEFINE_AIL_EVENT(xfs_ail_delete);
+
+TRACE_EVENT(xfs_log_assign_tail_lsn,
+	TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn),
+	TP_ARGS(log, new_lsn),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_lsn_t, new_lsn)
+		__field(xfs_lsn_t, old_lsn)
+		__field(xfs_lsn_t, last_sync_lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->new_lsn = new_lsn;
+		__entry->old_lsn = atomic64_read(&log->l_tail_lsn);
+		__entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+	),
+	TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+		  CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+		  CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn))
+)
 
 DECLARE_EVENT_CLASS(xfs_file_class,
 	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 4b47cfebd25b..a7287354e535 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -659,11 +659,13 @@ xfs_trans_ail_update_bulk(
 			if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
 				continue;
 
+			trace_xfs_ail_move(lip, lip->li_lsn, lsn);
 			xfs_ail_delete(ailp, lip);
 			if (mlip == lip)
 				mlip_changed = 1;
 		} else {
 			lip->li_flags |= XFS_LI_IN_AIL;
+			trace_xfs_ail_insert(lip, 0, lsn);
 		}
 		lip->li_lsn = lsn;
 		list_add(&lip->li_ail, &tmp);
@@ -732,6 +734,7 @@ xfs_trans_ail_delete_bulk(
 			return;
 		}
 
+		trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
 		xfs_ail_delete(ailp, lip);
 		lip->li_flags &= ~XFS_LI_IN_AIL;
 		lip->li_lsn = 0;
-- 
cgit v1.3


From d123031a5673cd38a85ce66cc07243dfe5f424c9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 1 Nov 2013 15:27:19 +1100
Subject: xfs: add tracepoints to AGF/AGI read operations

To help track down AGI/AGF lock ordering issues, I added these
tracepoints to tell us when an AGI or AGF is read and locked.  With
these we can now determine if the lock ordering goes wrong from
tracing captures.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c  |  5 ++++-
 fs/xfs/xfs_ialloc.c |  6 +++++-
 fs/xfs/xfs_trace.h  | 25 +++++++++++++++++++++++++
 3 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bcf16528bac5..9eab2dfdcbb5 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2294,6 +2294,8 @@ xfs_read_agf(
 {
 	int		error;
 
+	trace_xfs_read_agf(mp, agno);
+
 	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
@@ -2324,8 +2326,9 @@ xfs_alloc_read_agf(
 	struct xfs_perag	*pag;		/* per allocation group data */
 	int			error;
 
-	ASSERT(agno != NULLAGNUMBER);
+	trace_xfs_alloc_read_agf(mp, agno);
 
+	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_read_agf(mp, tp, agno,
 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
 			bpp);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 14d732f61a41..e87719c5bebe 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -40,6 +40,7 @@
 #include "xfs_icreate_item.h"
 #include "xfs_icache.h"
 #include "xfs_dinode.h"
+#include "xfs_trace.h"
 
 
 /*
@@ -1627,8 +1628,9 @@ xfs_read_agi(
 {
 	int			error;
 
-	ASSERT(agno != NULLAGNUMBER);
+	trace_xfs_read_agi(mp, agno);
 
+	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
 			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
@@ -1651,6 +1653,8 @@ xfs_ialloc_read_agi(
 	struct xfs_perag	*pag;	/* per allocation group data */
 	int			error;
 
+	trace_xfs_ialloc_read_agi(mp, agno);
+
 	error = xfs_read_agi(mp, tp, agno, bpp);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f195476a037c..425dfa45b9a0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -135,6 +135,31 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
 
+DECLARE_EVENT_CLASS(xfs_ag_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
+	TP_ARGS(mp, agno),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+	),
+	TP_printk("dev %d:%d agno %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno)
+);
+#define DEFINE_AG_EVENT(name)	\
+DEFINE_EVENT(xfs_ag_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),	\
+	TP_ARGS(mp, agno))
+
+DEFINE_AG_EVENT(xfs_read_agf);
+DEFINE_AG_EVENT(xfs_alloc_read_agf);
+DEFINE_AG_EVENT(xfs_read_agi);
+DEFINE_AG_EVENT(xfs_ialloc_read_agi);
+
 TRACE_EVENT(xfs_attr_list_node_descend,
 	TP_PROTO(struct xfs_attr_list_context *ctx,
 		 struct xfs_da_node_entry *btree),
-- 
cgit v1.3


From 359d992bcd398273637cd9edde10afca953783c4 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 18:21:05 +0800
Subject: xfs: simplify kmem_{zone_}zalloc

Introduce flag KM_ZERO which is used to alloc zeroed entry, and convert
kmem_{zone_}zalloc to call kmem_{zone_}alloc() with KM_ZERO directly,
in order to avoid the setting to zero step.
And following Dave's suggestion, make kmem_{zone_}zalloc static inline
into kmem.h as they're now just a simple wrapper.

V2:
  Make kmem_{zone_}zalloc static inline into kmem.h as Dave suggested.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/kmem.c | 22 ----------------------
 fs/xfs/kmem.h | 21 ++++++++++++++++++---
 2 files changed, 18 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a02cfb9e3bce..66a36befc5c0 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -62,17 +62,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 	} while (1);
 }
 
-void *
-kmem_zalloc(size_t size, xfs_km_flags_t flags)
-{
-	void	*ptr;
-
-	ptr = kmem_alloc(size, flags);
-	if (ptr)
-		memset((char *)ptr, 0, (int)size);
-	return ptr;
-}
-
 void *
 kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 {
@@ -128,14 +117,3 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
-
-void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-	void	*ptr;
-
-	ptr = kmem_zone_alloc(zone, flags);
-	if (ptr)
-		memset((char *)ptr, 0, kmem_cache_size(zone));
-	return ptr;
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 3a7371cab508..64db0e53edea 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -32,6 +32,7 @@ typedef unsigned __bitwise xfs_km_flags_t;
 #define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
 #define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
 #define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -43,7 +44,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
 
 	if (flags & KM_NOSLEEP) {
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -52,11 +53,14 @@ kmem_flags_convert(xfs_km_flags_t flags)
 		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
 			lflags &= ~__GFP_FS;
 	}
+
+	if (flags & KM_ZERO)
+		lflags |= __GFP_ZERO;
+
 	return lflags;
 }
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
-extern void *kmem_zalloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
 extern void  kmem_free(const void *);
@@ -64,6 +68,12 @@ extern void  kmem_free(const void *);
 
 extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
 
+static inline void *
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
+{
+	return kmem_alloc(size, flags | KM_ZERO);
+}
+
 /*
  * Zone interfaces
  */
@@ -102,6 +112,11 @@ kmem_zone_destroy(kmem_zone_t *zone)
 }
 
 extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
+
+static inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+	return kmem_zone_alloc(zone, flags | KM_ZERO);
+}
 
 #endif /* __XFS_SUPPORT_KMEM_H__ */
-- 
cgit v1.3


From a1212d278c05ca0a38f5cbd7ae90ac2e367228a8 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 7 Nov 2013 20:47:28 +0900
Subject: Revert "sysfs: drop kobj_ns_type handling"

This reverts commit cb26a311578e67769e92a39a0a63476533cb7e12.

It mysteriously causes NetworkManager to not find the wireless device
for me.  As far as I can tell, Tejun *meant* for this commit to not make
any semantic changes, but there clearly are some.  So revert it, taking
into account some of the calling convention changes that happened in
this area in subsequent commits.

Cc: Tejun Heo <tj@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sysfs/dir.c     | 88 +++++++++++++++++++++++++++++++++++++++---------------
 fs/sysfs/mount.c   | 24 +++++++++++----
 fs/sysfs/symlink.c | 27 +++++++++++++----
 fs/sysfs/sysfs.h   | 25 +++++++++++-----
 lib/kobject.c      |  5 +---
 5 files changed, 122 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 08c66969d52a..5e73d6626e50 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -111,11 +111,6 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd)
 	/* add new node and rebalance the tree */
 	rb_link_node(&sd->s_rb, parent, node);
 	rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
-
-	/* if @sd has ns tag, mark the parent to enable ns filtering */
-	if (sd->s_ns)
-		sd->s_parent->s_flags |= SYSFS_FLAG_HAS_NS;
-
 	return 0;
 }
 
@@ -135,13 +130,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 		sd->s_parent->s_dir.subdirs--;
 
 	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
-
-	/*
-	 * Either all or none of the children have tags.  Clearing HAS_NS
-	 * when there's no child left is enough to keep the flag synced.
-	 */
-	if (RB_EMPTY_ROOT(&sd->s_parent->s_dir.children))
-		sd->s_parent->s_flags &= ~SYSFS_FLAG_HAS_NS;
 }
 
 /**
@@ -291,6 +279,7 @@ static int sysfs_dentry_delete(const struct dentry *dentry)
 static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct sysfs_dirent *sd;
+	int type;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -311,8 +300,13 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 		goto out_bad;
 
 	/* The sysfs dirent has been moved to a different namespace */
-	if (sd->s_ns && sd->s_ns != sysfs_info(dentry->d_sb)->ns)
-		goto out_bad;
+	type = KOBJ_NS_TYPE_NONE;
+	if (sd->s_parent) {
+		type = sysfs_ns_type(sd->s_parent);
+		if (type != KOBJ_NS_TYPE_NONE &&
+				sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
+			goto out_bad;
+	}
 
 	mutex_unlock(&sysfs_mutex);
 out_valid:
@@ -432,6 +426,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
 	struct sysfs_inode_attrs *ps_iattr;
 	int ret;
 
+	if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
+		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+			sysfs_ns_type(parent_sd) ? "required" : "invalid",
+			parent_sd->s_name, sd->s_name);
+		return -EINVAL;
+	}
+
 	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
 	sd->s_parent = sysfs_get(parent_sd);
 
@@ -611,6 +612,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 	struct rb_node *node = parent_sd->s_dir.children.rb_node;
 	unsigned int hash;
 
+	if (!!sysfs_ns_type(parent_sd) != !!ns) {
+		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+			sysfs_ns_type(parent_sd) ? "required" : "invalid",
+			parent_sd->s_name, name);
+		return NULL;
+	}
+
 	hash = sysfs_name_hash(name, ns);
 	while (node) {
 		struct sysfs_dirent *sd;
@@ -659,6 +667,7 @@ struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
+		      enum kobj_ns_type type,
 		      const char *name, const void *ns,
 		      struct sysfs_dirent **p_sd)
 {
@@ -672,6 +681,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	if (!sd)
 		return -ENOMEM;
 
+	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
 	sd->s_ns = ns;
 	sd->s_dir.kobj = kobj;
 
@@ -691,7 +701,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd, name, NULL, p_sd);
+	return create_dir(kobj, kobj->sd,
+			  KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
+}
+
+/**
+ *	sysfs_read_ns_type: return associated ns_type
+ *	@kobj: the kobject being queried
+ *
+ *	Each kobject can be tagged with exactly one namespace type
+ *	(i.e. network or user).  Return the ns_type associated with
+ *	this object if any
+ */
+static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
+{
+	const struct kobj_ns_type_operations *ops;
+	enum kobj_ns_type type;
+
+	ops = kobj_child_ns_ops(kobj);
+	if (!ops)
+		return KOBJ_NS_TYPE_NONE;
+
+	type = ops->type;
+	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
+	BUG_ON(type >= KOBJ_NS_TYPES);
+	BUG_ON(!kobj_ns_type_registered(type));
+
+	return type;
 }
 
 /**
@@ -701,6 +737,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
  */
 int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
+	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
 	int error = 0;
 
@@ -714,7 +751,9 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 	if (!parent_sd)
 		return -ENOENT;
 
-	error = create_dir(kobj, parent_sd, kobject_name(kobj), ns, &sd);
+	type = sysfs_read_ns_type(kobj);
+
+	error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -728,12 +767,13 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
-	const void *ns = NULL;
+	enum kobj_ns_type type;
+	const void *ns;
 
 	mutex_lock(&sysfs_mutex);
 
-	if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS)
-		ns = sysfs_info(dir->i_sb)->ns;
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dir->i_sb)->ns[type];
 
 	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
 
@@ -1056,15 +1096,15 @@ static int sysfs_readdir(struct file *file, struct dir_context *ctx)
 	struct dentry *dentry = file->f_path.dentry;
 	struct sysfs_dirent *parent_sd = dentry->d_fsdata;
 	struct sysfs_dirent *pos = file->private_data;
-	const void *ns = NULL;
+	enum kobj_ns_type type;
+	const void *ns;
+
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dentry->d_sb)->ns[type];
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 	mutex_lock(&sysfs_mutex);
-
-	if (parent_sd->s_flags & SYSFS_FLAG_HAS_NS)
-		ns = sysfs_info(dentry->d_sb)->ns;
-
 	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
 	     pos;
 	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8c24bce2f4ae..834ec2cdb7a3 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -36,7 +36,7 @@ static const struct super_operations sysfs_ops = {
 struct sysfs_dirent sysfs_root = {
 	.s_name		= "",
 	.s_count	= ATOMIC_INIT(1),
-	.s_flags	= SYSFS_DIR,
+	.s_flags	= SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
 	.s_mode		= S_IFDIR | S_IRUGO | S_IXUGO,
 	.s_ino		= 1,
 };
@@ -77,8 +77,14 @@ static int sysfs_test_super(struct super_block *sb, void *data)
 {
 	struct sysfs_super_info *sb_info = sysfs_info(sb);
 	struct sysfs_super_info *info = data;
+	enum kobj_ns_type type;
+	int found = 1;
 
-	return sb_info->ns == info->ns;
+	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
+		if (sb_info->ns[type] != info->ns[type])
+			found = 0;
+	}
+	return found;
 }
 
 static int sysfs_set_super(struct super_block *sb, void *data)
@@ -92,7 +98,9 @@ static int sysfs_set_super(struct super_block *sb, void *data)
 
 static void free_sysfs_super_info(struct sysfs_super_info *info)
 {
-	kobj_ns_drop(KOBJ_NS_TYPE_NET, info->ns);
+	int type;
+	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
+		kobj_ns_drop(type, info->ns[type]);
 	kfree(info);
 }
 
@@ -100,6 +108,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
 	struct sysfs_super_info *info;
+	enum kobj_ns_type type;
 	struct super_block *sb;
 	int error;
 
@@ -107,15 +116,18 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
 			return ERR_PTR(-EPERM);
 
-		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
-			return ERR_PTR(-EPERM);
+		for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
+			if (!kobj_ns_current_may_mount(type))
+				return ERR_PTR(-EPERM);
+		}
 	}
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
 
-	info->ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
+		info->ns[type] = kobj_ns_grab_current(type);
 
 	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1a23681b8179..3ae3f1bf1a09 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,6 +28,7 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 	struct sysfs_dirent *target_sd = NULL;
 	struct sysfs_dirent *sd = NULL;
 	struct sysfs_addrm_cxt acxt;
+	enum kobj_ns_type ns_type;
 	int error;
 
 	BUG_ON(!name || !parent_sd);
@@ -51,15 +52,29 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 	if (!sd)
 		goto out_put;
 
-	sd->s_ns = target_sd->s_ns;
+	ns_type = sysfs_ns_type(parent_sd);
+	if (ns_type)
+		sd->s_ns = target_sd->s_ns;
 	sd->s_symlink.target_sd = target_sd;
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
 	sysfs_addrm_start(&acxt);
-	if (warn)
-		error = sysfs_add_one(&acxt, sd, parent_sd);
-	else
-		error = __sysfs_add_one(&acxt, sd, parent_sd);
+	/* Symlinks must be between directories with the same ns_type */
+	if (!ns_type ||
+	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
+		if (warn)
+			error = sysfs_add_one(&acxt, sd, parent_sd);
+		else
+			error = __sysfs_add_one(&acxt, sd, parent_sd);
+	} else {
+		error = -EINVAL;
+		WARN(1, KERN_WARNING
+			"sysfs: symlink across ns_types %s/%s -> %s/%s\n",
+			parent_sd->s_name,
+			sd->s_name,
+			sd->s_symlink.target_sd->s_parent->s_name,
+			sd->s_symlink.target_sd->s_name);
+	}
 	sysfs_addrm_finish(&acxt);
 
 	if (error)
@@ -149,7 +164,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 	 * sysfs_remove_dir() for details.
 	 */
 	spin_lock(&sysfs_symlink_target_lock);
-	if (targ->sd)
+	if (targ->sd && sysfs_ns_type(kobj->sd))
 		ns = targ->sd->s_ns;
 	spin_unlock(&sysfs_symlink_target_lock);
 	sysfs_hash_and_remove(kobj->sd, name, ns);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index e3aea92ebfa3..0af09fbfb3f6 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -90,8 +90,11 @@ struct sysfs_dirent {
 #define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
 #define SYSFS_ACTIVE_REF		(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
 
-#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
-#define SYSFS_FLAG_HAS_NS		0x01000
+/* identify any namespace tag on sysfs_dirents */
+#define SYSFS_NS_TYPE_MASK		0xf00
+#define SYSFS_NS_TYPE_SHIFT		8
+
+#define SYSFS_FLAG_MASK			~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
 #define SYSFS_FLAG_REMOVED		0x02000
 
 static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
@@ -99,6 +102,15 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
 	return sd->s_flags & SYSFS_TYPE_MASK;
 }
 
+/*
+ * Return any namespace tags on this dirent.
+ * enum kobj_ns_type is defined in linux/kobject.h
+ */
+static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
+{
+	return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 #define sysfs_dirent_init_lockdep(sd)				\
@@ -143,13 +155,12 @@ struct sysfs_addrm_cxt {
  */
 
 /*
- * Each sb is associated with one namespace tag, currently the network
- * namespace of the task which mounted this sysfs instance.  If multiple
- * tags become necessary, make the following an array and compare
- * sysfs_dirent tag against every entry.
+ * Each sb is associated with a set of namespace tags (i.e.
+ * the network namespace of the task which mounted this sysfs
+ * instance).
  */
 struct sysfs_super_info {
-	void *ns;
+	void *ns[KOBJ_NS_TYPES];
 };
 #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
 extern struct sysfs_dirent sysfs_root;
diff --git a/lib/kobject.c b/lib/kobject.c
index 7a1c203083eb..5b4b8886435e 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -30,14 +30,11 @@
 const void *kobject_namespace(struct kobject *kobj)
 {
 	const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj);
-	const void *ns;
 
 	if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE)
 		return NULL;
 
-	ns = kobj->ktype->namespace(kobj);
-	WARN_ON(!ns);	/* @kobj in a namespace is required to have !NULL tag */
-	return ns;
+	return kobj->ktype->namespace(kobj);
 }
 
 /*
-- 
cgit v1.3


From f2754114400afe38ad5ade3b588451c4f36a61af Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 7 Nov 2013 22:22:08 -0500
Subject: ext4: remove unreachable code after ext4_can_extents_be_merged()

Commit ec22ba8e ("ext4: disable merging of uninitialized extents")
ensured that if either extent under consideration is uninit, we
decline to merge, and ext4_can_extents_be_merged() returns false.

So there is no need for the caller to then test whether the
extent under consideration is unitialized; if it were, we
wouldn't have gotten that far.

The comments were also inaccurate; ext4_can_extents_be_merged()
no longer XORs the states, it fails if *either* is uninit.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
---
 fs/ext4/extents.c | 26 --------------------------
 1 file changed, 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index de6d4674ba6e..35f65cf4f318 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1715,7 +1715,6 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 	struct ext4_extent_header *eh;
 	unsigned int depth, len;
 	int merge_done = 0;
-	int uninitialized = 0;
 
 	depth = ext_depth(inode);
 	BUG_ON(path[depth].p_hdr == NULL);
@@ -1725,12 +1724,8 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
 			break;
 		/* merge with next extent! */
-		if (ext4_ext_is_uninitialized(ex))
-			uninitialized = 1;
 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 				+ ext4_ext_get_actual_len(ex + 1));
-		if (uninitialized)
-			ext4_ext_mark_uninitialized(ex);
 
 		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
 			len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1885,7 +1880,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	struct ext4_ext_path *npath = NULL;
 	int depth, len, err;
 	ext4_lblk_t next;
-	unsigned uninitialized = 0;
 	int mb_flags = 0;
 
 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
@@ -1937,18 +1931,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 			if (err)
 				return err;
 
-			/*
-			 * ext4_can_extents_be_merged should have checked
-			 * that either both extents are uninitialized, or
-			 * both aren't. Thus we need to check only one of
-			 * them here.
-			 */
-			if (ext4_ext_is_uninitialized(ex))
-				uninitialized = 1;
 			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 					+ ext4_ext_get_actual_len(newext));
-			if (uninitialized)
-				ext4_ext_mark_uninitialized(ex);
 			eh = path[depth].p_hdr;
 			nearex = ex;
 			goto merge;
@@ -1971,20 +1955,10 @@ prepend:
 			if (err)
 				return err;
 
-			/*
-			 * ext4_can_extents_be_merged should have checked
-			 * that either both extents are uninitialized, or
-			 * both aren't. Thus we need to check only one of
-			 * them here.
-			 */
-			if (ext4_ext_is_uninitialized(ex))
-				uninitialized = 1;
 			ex->ee_block = newext->ee_block;
 			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
 			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 					+ ext4_ext_get_actual_len(newext));
-			if (uninitialized)
-				ext4_ext_mark_uninitialized(ex);
 			eh = path[depth].p_hdr;
 			nearex = ex;
 			goto merge;
-- 
cgit v1.3


From fb51b5ef9c07844f80402702bd3d3002ceca5cd9 Mon Sep 17 00:00:00 2001
From: Changman Lee <cm224.lee@samsung.com>
Date: Thu, 7 Nov 2013 12:48:25 +0900
Subject: f2fs: cleanup waiting routine for writeback pages in cp

use genernal method supported by kernel

 o changes from v1
   If any waiter exists at end io, wake up it.

Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/checkpoint.c | 25 ++++++++++++++++---------
 fs/f2fs/f2fs.h       |  2 +-
 fs/f2fs/segment.c    |  5 +++--
 fs/f2fs/super.c      |  1 +
 4 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index d430157ffe60..5716e5eb4e8e 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -634,6 +634,21 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
 	f2fs_unlock_all(sbi);
 }
 
+static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+{
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		if (!get_pages(sbi, F2FS_WRITEBACK))
+			break;
+
+		io_schedule();
+	}
+	finish_wait(&sbi->cp_wait, &wait);
+}
+
 static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -743,15 +758,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	f2fs_put_page(cp_page, 1);
 
 	/* wait for previous submitted node/meta pages writeback */
-	sbi->cp_task = current;
-	while (get_pages(sbi, F2FS_WRITEBACK)) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!get_pages(sbi, F2FS_WRITEBACK))
-			break;
-		io_schedule();
-	}
-	__set_current_state(TASK_RUNNING);
-	sbi->cp_task = NULL;
+	wait_on_all_pages_writeback(sbi);
 
 	filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
 	filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 625eb4befad4..89dc7508faf2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -372,7 +372,7 @@ struct f2fs_sb_info {
 	struct mutex writepages;		/* mutex for writepages() */
 	bool por_doing;				/* recovery is doing or not */
 	bool on_build_free_nids;		/* build_free_nids is doing */
-	struct task_struct *cp_task;		/* checkpoint task */
+	wait_queue_head_t cp_wait;
 
 	/* for orphan inode management */
 	struct list_head orphan_inode_list;	/* orphan inode list */
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index ff363e686b7e..86dc28949192 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -592,8 +592,9 @@ static void f2fs_end_io_write(struct bio *bio, int err)
 	if (p->is_sync)
 		complete(p->wait);
 
-	if (!get_pages(p->sbi, F2FS_WRITEBACK) && p->sbi->cp_task)
-		wake_up_process(p->sbi->cp_task);
+	if (!get_pages(p->sbi, F2FS_WRITEBACK) &&
+			!list_empty(&p->sbi->cp_wait.task_list))
+		wake_up(&p->sbi->cp_wait);
 
 	kfree(p);
 	bio_put(bio);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index e42351cbe166..00e79df5a34f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -876,6 +876,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	spin_lock_init(&sbi->stat_lock);
 	init_rwsem(&sbi->bio_sem);
 	init_rwsem(&sbi->cp_rwsem);
+	init_waitqueue_head(&sbi->cp_wait);
 	init_sb_info(sbi);
 
 	/* get an inode for meta space */
-- 
cgit v1.3


From 1d15bd2034c9b0f9ac88fa0e4cd33e0d792b0bb0 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 7 Nov 2013 10:14:51 +0800
Subject: f2fs: fix memory leak after kobject init failed in fill_super

If we failed to init&add kobject when fill_super, stats info and proc object of
f2fs will not be released.
We should free them before we finish fill_super.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/super.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 00e79df5a34f..bafff72de8e8 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -975,12 +975,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 		/* After POR, we can run background GC thread.*/
 		err = start_gc_thread(sbi);
 		if (err)
-			goto fail;
+			goto free_gc;
 	}
 
 	err = f2fs_build_stats(sbi);
 	if (err)
-		goto fail;
+		goto free_gc;
 
 	if (f2fs_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -1006,6 +1006,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 fail:
+	if (sbi->s_proc) {
+		remove_proc_entry("segment_info", sbi->s_proc);
+		remove_proc_entry(sb->s_id, f2fs_proc_root);
+	}
+	f2fs_destroy_stats(sbi);
+free_gc:
 	stop_gc_thread(sbi);
 free_root_inode:
 	dput(sb->s_root);
-- 
cgit v1.3


From dd1f723bf56bd96efc9d90e9e60dc511c79de48f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 8 Nov 2013 00:14:53 -0500
Subject: ext4: use prandom_u32() instead of get_random_bytes()

Many of the uses of get_random_bytes() do not actually need
cryptographically secure random numbers.  Replace those uses with a
call to prandom_u32(), which is faster and which doesn't consume
entropy from the /dev/random driver.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 2 +-
 fs/ext4/mmp.c    | 2 +-
 fs/ext4/super.c  | 7 ++-----
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 137193ff389b..0ee59a6644e2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -432,7 +432,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
 			grp = hinfo.hash;
 		} else
-			get_random_bytes(&grp, sizeof(grp));
+			grp = prandom_u32();
 		parent_group = (unsigned)grp % ngroups;
 		for (i = 0; i < ngroups; i++) {
 			g = (parent_group + i) % ngroups;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 214461e42a05..04434ad3e8e0 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -259,7 +259,7 @@ static unsigned int mmp_new_seq(void)
 	u32 new_seq;
 
 	do {
-		get_random_bytes(&new_seq, sizeof(u32));
+		new_seq = prandom_u32();
 	} while (new_seq > EXT4_MMP_SEQ_MAX);
 
 	return new_seq;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d3a857bfae47..c977f4e4e63b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3068,7 +3068,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_li_request *elr;
-	unsigned long rnd;
 
 	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
 	if (!elr)
@@ -3083,10 +3082,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
 	 * spread the inode table initialization requests
 	 * better.
 	 */
-	get_random_bytes(&rnd, sizeof(rnd));
-	elr->lr_next_sched = jiffies + (unsigned long)rnd %
-			     (EXT4_DEF_LI_MAX_START_DELAY * HZ);
-
+	elr->lr_next_sched = jiffies + (prandom_u32() %
+				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
 	return elr;
 }
 
-- 
cgit v1.3


From 8077c0d983ab276ec5f2700df56a64d671781905 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 14 Oct 2013 12:14:13 -0400
Subject: bdi: test bdi_init failure

There were two places where return value from bdi_init was not tested.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/char_dev.c               | 3 ++-
 include/linux/backing-dev.h | 4 ++--
 mm/swap.c                   | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index afc2bb691780..b9bc05b9513a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -574,7 +574,8 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
 void __init chrdev_init(void)
 {
 	cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
-	bdi_init(&directly_mappable_cdev_bdi);
+	if (bdi_init(&directly_mappable_cdev_bdi))
+		panic("Failed to init directly mappable cdev bdi");
 }
 
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5f66d519a726..24819001f5c8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -109,7 +109,7 @@ struct backing_dev_info {
 #endif
 };
 
-int bdi_init(struct backing_dev_info *bdi);
+int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
 __printf(3, 4)
@@ -117,7 +117,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
-int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
+int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
diff --git a/mm/swap.c b/mm/swap.c
index 759c3caf44bd..7a9f80d451f5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -934,7 +934,8 @@ void __init swap_setup(void)
 #ifdef CONFIG_SWAP
 	int i;
 
-	bdi_init(swapper_spaces[0].backing_dev_info);
+	if (bdi_init(swapper_spaces[0].backing_dev_info))
+		panic("Failed to init swap bdi");
 	for (i = 0; i < MAX_SWAPFILES; i++) {
 		spin_lock_init(&swapper_spaces[i].tree_lock);
 		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
-- 
cgit v1.3


From 170d800af83f3ab2b5ced0e370a861e023dee22a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 15 Oct 2013 12:22:29 -0600
Subject: block: Replace __get_cpu_var uses

__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x).  This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.

Other use cases are for storing and retrieving data from the current
processors percpu area.  __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.

__get_cpu_var() is defined as :

#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))

__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.

this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.

This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset.  Thereby address calculations are avoided and less registers
are used when code is generated.

At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.

The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e.  using a global
register that may be set to the per cpu base.

Transformations done to __get_cpu_var()

1. Determine the address of the percpu instance of the current processor.

	DEFINE_PER_CPU(int, y);
	int *x = &__get_cpu_var(y);

    Converts to

	int *x = this_cpu_ptr(&y);

2. Same as #1 but this time an array structure is involved.

	DEFINE_PER_CPU(int, y[20]);
	int *x = __get_cpu_var(y);

    Converts to

	int *x = this_cpu_ptr(y);

3. Retrieve the content of the current processors instance of a per cpu
variable.

	DEFINE_PER_CPU(int, y);
	int x = __get_cpu_var(y)

   Converts to

	int x = __this_cpu_read(y);

4. Retrieve the content of a percpu struct

	DEFINE_PER_CPU(struct mystruct, y);
	struct mystruct x = __get_cpu_var(y);

   Converts to

	memcpy(&x, this_cpu_ptr(&y), sizeof(x));

5. Assignment to a per cpu variable

	DEFINE_PER_CPU(int, y)
	__get_cpu_var(y) = x;

   Converts to

	this_cpu_write(y, x);

6. Increment/Decrement etc of a per cpu variable

	DEFINE_PER_CPU(int, y);
	__get_cpu_var(y)++

   Converts to

	this_cpu_inc(y)

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-iopoll.c  | 6 +++---
 block/blk-softirq.c | 8 ++++----
 fs/fscache/object.c | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index 4b8d9b541112..1855bf51edb0 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -35,7 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
+	list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
 	__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
 	local_irq_restore(flags);
 }
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(blk_iopoll_complete);
 
 static void blk_iopoll_softirq(struct softirq_action *h)
 {
-	struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+	struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
 	int rearm = 0, budget = blk_iopoll_budget;
 	unsigned long start_time = jiffies;
 
@@ -201,7 +201,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
 
 		local_irq_disable();
 		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
-				 &__get_cpu_var(blk_cpu_iopoll));
+				 this_cpu_ptr(&blk_cpu_iopoll));
 		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
 		local_irq_enable();
 	}
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ec9e60636f43..ce4b8bfd3d27 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -23,7 +23,7 @@ static void blk_done_softirq(struct softirq_action *h)
 	struct list_head *cpu_list, local_list;
 
 	local_irq_disable();
-	cpu_list = &__get_cpu_var(blk_cpu_done);
+	cpu_list = this_cpu_ptr(&blk_cpu_done);
 	list_replace_init(cpu_list, &local_list);
 	local_irq_enable();
 
@@ -44,7 +44,7 @@ static void trigger_softirq(void *data)
 	struct list_head *list;
 
 	local_irq_save(flags);
-	list = &__get_cpu_var(blk_cpu_done);
+	list = this_cpu_ptr(&blk_cpu_done);
 	list_add_tail(&rq->csd.list, list);
 
 	if (list->next == &rq->csd.list)
@@ -90,7 +90,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
 
 		local_irq_disable();
 		list_splice_init(&per_cpu(blk_cpu_done, cpu),
-				 &__get_cpu_var(blk_cpu_done));
+				 this_cpu_ptr(&blk_cpu_done));
 		raise_softirq_irqoff(BLOCK_SOFTIRQ);
 		local_irq_enable();
 	}
@@ -135,7 +135,7 @@ void __blk_complete_request(struct request *req)
 	if (ccpu == cpu || shared) {
 		struct list_head *list;
 do_local:
-		list = &__get_cpu_var(blk_cpu_done);
+		list = this_cpu_ptr(&blk_cpu_done);
 		list_add_tail(&req->csd.list, list);
 
 		/*
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 86d75a60b20c..3b0712213993 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -796,7 +796,7 @@ void fscache_enqueue_object(struct fscache_object *object)
  */
 bool fscache_object_sleep_till_congested(signed long *timeoutp)
 {
-	wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
+	wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait);
 	DEFINE_WAIT(wait);
 
 	if (fscache_object_congested())
-- 
cgit v1.3


From 6678d83f18386eb103f8345024e52c5abe61725c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 7 Aug 2013 11:14:32 -0700
Subject: block: Consolidate duplicated bio_trim() implementations

Someone cut and pasted md's md_trim_bio() into xen-blkfront.c. Come on,
we should know better than this.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Neil Brown <neilb@suse.de>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkfront.c | 53 +-------------------------------------------
 drivers/md/md.c              | 40 ---------------------------------
 drivers/md/md.h              |  1 -
 drivers/md/raid1.c           | 10 ++++-----
 drivers/md/raid10.c          | 18 +++++++--------
 fs/bio.c                     | 46 ++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h          |  1 +
 7 files changed, 61 insertions(+), 108 deletions(-)

(limited to 'fs')

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a4660bbee8a6..8d53ed293606 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1336,57 +1336,6 @@ static int blkfront_probe(struct xenbus_device *dev,
 	return 0;
 }
 
-/*
- * This is a clone of md_trim_bio, used to split a bio into smaller ones
- */
-static void trim_bio(struct bio *bio, int offset, int size)
-{
-	/* 'bio' is a cloned bio which we need to trim to match
-	 * the given offset and size.
-	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
-	 */
-	int i;
-	struct bio_vec *bvec;
-	int sofar = 0;
-
-	size <<= 9;
-	if (offset == 0 && size == bio->bi_size)
-		return;
-
-	bio->bi_sector += offset;
-	bio->bi_size = size;
-	offset <<= 9;
-	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
-
-	while (bio->bi_idx < bio->bi_vcnt &&
-	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
-		/* remove this whole bio_vec */
-		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
-		bio->bi_idx++;
-	}
-	if (bio->bi_idx < bio->bi_vcnt) {
-		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
-		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
-	}
-	/* avoid any complications with bi_idx being non-zero*/
-	if (bio->bi_idx) {
-		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
-			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
-		bio->bi_vcnt -= bio->bi_idx;
-		bio->bi_idx = 0;
-	}
-	/* Make sure vcnt and last bv are not too big */
-	bio_for_each_segment(bvec, bio, i) {
-		if (sofar + bvec->bv_len > size)
-			bvec->bv_len = size - sofar;
-		if (bvec->bv_len == 0) {
-			bio->bi_vcnt = i;
-			break;
-		}
-		sofar += bvec->bv_len;
-	}
-}
-
 static void split_bio_end(struct bio *bio, int error)
 {
 	struct split_bio *split_bio = bio->bi_private;
@@ -1522,7 +1471,7 @@ static int blkif_recover(struct blkfront_info *info)
 					   (unsigned int)(bio->bi_size >> 9) - offset);
 				cloned_bio = bio_clone(bio, GFP_NOIO);
 				BUG_ON(cloned_bio == NULL);
-				trim_bio(cloned_bio, offset, size);
+				bio_trim(cloned_bio, offset, size);
 				cloned_bio->bi_private = split_bio;
 				cloned_bio->bi_end_io = split_bio_end;
 				submit_bio(cloned_bio->bi_rw, cloned_bio);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 561a65f82e26..752119068d66 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -183,46 +183,6 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
 
-void md_trim_bio(struct bio *bio, int offset, int size)
-{
-	/* 'bio' is a cloned bio which we need to trim to match
-	 * the given offset and size.
-	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
-	 */
-	int i;
-	struct bio_vec *bvec;
-	int sofar = 0;
-
-	size <<= 9;
-	if (offset == 0 && size == bio->bi_size)
-		return;
-
-	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
-
-	bio_advance(bio, offset << 9);
-
-	bio->bi_size = size;
-
-	/* avoid any complications with bi_idx being non-zero*/
-	if (bio->bi_idx) {
-		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
-			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
-		bio->bi_vcnt -= bio->bi_idx;
-		bio->bi_idx = 0;
-	}
-	/* Make sure vcnt and last bv are not too big */
-	bio_for_each_segment(bvec, bio, i) {
-		if (sofar + bvec->bv_len > size)
-			bvec->bv_len = size - sofar;
-		if (bvec->bv_len == 0) {
-			bio->bi_vcnt = i;
-			break;
-		}
-		sofar += bvec->bv_len;
-	}
-}
-EXPORT_SYMBOL_GPL(md_trim_bio);
-
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 608050c43f17..c96456c16700 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -617,7 +617,6 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 				   struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
-extern void md_trim_bio(struct bio *bio, int offset, int size);
 
 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
 static inline int mddev_check_plugged(struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index aacf6bf352d8..af6681b19776 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1097,8 +1097,8 @@ read_again:
 		r1_bio->read_disk = rdisk;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
-			    max_sectors);
+		bio_trim(read_bio, r1_bio->sector - bio->bi_sector,
+			 max_sectors);
 
 		r1_bio->bios[rdisk] = read_bio;
 
@@ -1266,7 +1266,7 @@ read_again:
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+		bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -2126,7 +2126,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 		wbio->bi_sector = r1_bio->sector;
 		wbio->bi_size = r1_bio->sectors << 9;
 
-		md_trim_bio(wbio, sector - r1_bio->sector, sectors);
+		bio_trim(wbio, sector - r1_bio->sector, sectors);
 		wbio->bi_sector += rdev->data_offset;
 		wbio->bi_bdev = rdev->bdev;
 		if (submit_bio_wait(WRITE, wbio) == 0)
@@ -2241,7 +2241,7 @@ read_more:
 		}
 		r1_bio->read_disk = disk;
 		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
-		md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+		bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors);
 		r1_bio->bios[r1_bio->read_disk] = bio;
 		rdev = conf->mirrors[disk].rdev;
 		printk_ratelimited(KERN_ERR
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 73dc8a377522..7c3508abb5e1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1302,8 +1302,8 @@ read_again:
 		slot = r10_bio->read_slot;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
+		bio_trim(read_bio, r10_bio->sector - bio->bi_sector,
+			 max_sectors);
 
 		r10_bio->devs[slot].bio = read_bio;
 		r10_bio->devs[slot].rdev = rdev;
@@ -1510,8 +1510,8 @@ retry_write:
 		if (r10_bio->devs[i].bio) {
 			struct md_rdev *rdev = conf->mirrors[d].rdev;
 			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-				    max_sectors);
+			bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+				 max_sectors);
 			r10_bio->devs[i].bio = mbio;
 
 			mbio->bi_sector	= (r10_bio->devs[i].addr+
@@ -1553,8 +1553,8 @@ retry_write:
 				rdev = conf->mirrors[d].rdev;
 			}
 			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-				    max_sectors);
+			bio_trim(mbio, r10_bio->sector - bio->bi_sector,
+				 max_sectors);
 			r10_bio->devs[i].repl_bio = mbio;
 
 			mbio->bi_sector	= (r10_bio->devs[i].addr +
@@ -2614,7 +2614,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 			sectors = sect_to_write;
 		/* Write at 'sector' for 'sectors' */
 		wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(wbio, sector - bio->bi_sector, sectors);
+		bio_trim(wbio, sector - bio->bi_sector, sectors);
 		wbio->bi_sector = (r10_bio->devs[i].addr+
 				   choose_data_offset(r10_bio, rdev) +
 				   (sector - r10_bio->sector));
@@ -2687,9 +2687,7 @@ read_more:
 		(unsigned long long)r10_bio->sector);
 	bio = bio_clone_mddev(r10_bio->master_bio,
 			      GFP_NOIO, mddev);
-	md_trim_bio(bio,
-		    r10_bio->sector - bio->bi_sector,
-		    max_sectors);
+	bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors);
 	r10_bio->devs[slot].bio = bio;
 	r10_bio->devs[slot].rdev = rdev;
 	bio->bi_sector = r10_bio->devs[slot].addr
diff --git a/fs/bio.c b/fs/bio.c
index ea5035da4d9a..2bdb4e25ee77 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1804,6 +1804,52 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 }
 EXPORT_SYMBOL(bio_split);
 
+/**
+ * bio_trim - trim a bio
+ * @bio:	bio to trim
+ * @offset:	number of sectors to trim from the front of @bio
+ * @size:	size we want to trim @bio to, in sectors
+ */
+void bio_trim(struct bio *bio, int offset, int size)
+{
+	/* 'bio' is a cloned bio which we need to trim to match
+	 * the given offset and size.
+	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
+	 */
+	int i;
+	struct bio_vec *bvec;
+	int sofar = 0;
+
+	size <<= 9;
+	if (offset == 0 && size == bio->bi_size)
+		return;
+
+	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+	bio_advance(bio, offset << 9);
+
+	bio->bi_size = size;
+
+	/* avoid any complications with bi_idx being non-zero*/
+	if (bio->bi_idx) {
+		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+		bio->bi_vcnt -= bio->bi_idx;
+		bio->bi_idx = 0;
+	}
+	/* Make sure vcnt and last bv are not too big */
+	bio_for_each_segment(bvec, bio, i) {
+		if (sofar + bvec->bv_len > size)
+			bvec->bv_len = size - sofar;
+		if (bvec->bv_len == 0) {
+			bio->bi_vcnt = i;
+			break;
+		}
+		sofar += bvec->bv_len;
+	}
+}
+EXPORT_SYMBOL_GPL(bio_trim);
+
 /**
  *      bio_sector_offset - Find hardware sector offset in bio
  *      @bio:           bio to inspect
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ec48bac5b039..162036aca741 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -218,6 +218,7 @@ struct bio_pair {
 };
 extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
 extern void bio_pair_release(struct bio_pair *dbio);
+extern void bio_trim(struct bio *bio, int offset, int size);
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
-- 
cgit v1.3


From 42c326082d8a2c91506f951ace638deae1faf083 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Nov 2013 12:31:16 -0500
Subject: switch shrink_dcache_for_umount() to use of d_walk()

we have too many iterators in fs/dcache.c...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 195 ++++++++++++++++++++++++++----------------------------------
 1 file changed, 85 insertions(+), 110 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index c8e83d0d61ac..eb0978da1bd4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1074,116 +1074,6 @@ void shrink_dcache_sb(struct super_block *sb)
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
-/*
- * destroy a single subtree of dentries for unmount
- * - see the comments on shrink_dcache_for_umount() for a description of the
- *   locking
- */
-static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
-{
-	struct dentry *parent;
-
-	BUG_ON(!IS_ROOT(dentry));
-
-	for (;;) {
-		/* descend to the first leaf in the current subtree */
-		while (!list_empty(&dentry->d_subdirs))
-			dentry = list_entry(dentry->d_subdirs.next,
-					    struct dentry, d_u.d_child);
-
-		/* consume the dentries from this leaf up through its parents
-		 * until we find one with children or run out altogether */
-		do {
-			struct inode *inode;
-
-			/*
-			 * inform the fs that this dentry is about to be
-			 * unhashed and destroyed.
-			 */
-			if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
-			    !d_unhashed(dentry))
-				dentry->d_op->d_prune(dentry);
-
-			dentry_lru_del(dentry);
-			__d_shrink(dentry);
-
-			if (dentry->d_lockref.count != 0) {
-				printk(KERN_ERR
-				       "BUG: Dentry %p{i=%lx,n=%s}"
-				       " still in use (%d)"
-				       " [unmount of %s %s]\n",
-				       dentry,
-				       dentry->d_inode ?
-				       dentry->d_inode->i_ino : 0UL,
-				       dentry->d_name.name,
-				       dentry->d_lockref.count,
-				       dentry->d_sb->s_type->name,
-				       dentry->d_sb->s_id);
-				BUG();
-			}
-
-			if (IS_ROOT(dentry)) {
-				parent = NULL;
-				list_del(&dentry->d_u.d_child);
-			} else {
-				parent = dentry->d_parent;
-				parent->d_lockref.count--;
-				list_del(&dentry->d_u.d_child);
-			}
-
-			inode = dentry->d_inode;
-			if (inode) {
-				dentry->d_inode = NULL;
-				hlist_del_init(&dentry->d_alias);
-				if (dentry->d_op && dentry->d_op->d_iput)
-					dentry->d_op->d_iput(dentry, inode);
-				else
-					iput(inode);
-			}
-
-			d_free(dentry);
-
-			/* finished when we fall off the top of the tree,
-			 * otherwise we ascend to the parent and move to the
-			 * next sibling if there is one */
-			if (!parent)
-				return;
-			dentry = parent;
-		} while (list_empty(&dentry->d_subdirs));
-
-		dentry = list_entry(dentry->d_subdirs.next,
-				    struct dentry, d_u.d_child);
-	}
-}
-
-/*
- * destroy the dentries attached to a superblock on unmounting
- * - we don't need to use dentry->d_lock because:
- *   - the superblock is detached from all mountings and open files, so the
- *     dentry trees will not be rearranged by the VFS
- *   - s_umount is write-locked, so the memory pressure shrinker will ignore
- *     any dentries belonging to this superblock that it comes across
- *   - the filesystem itself is no longer permitted to rearrange the dentries
- *     in this superblock
- */
-void shrink_dcache_for_umount(struct super_block *sb)
-{
-	struct dentry *dentry;
-
-	if (down_read_trylock(&sb->s_umount))
-		BUG();
-
-	dentry = sb->s_root;
-	sb->s_root = NULL;
-	dentry->d_lockref.count--;
-	shrink_dcache_for_umount_subtree(dentry);
-
-	while (!hlist_bl_empty(&sb->s_anon)) {
-		dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
-		shrink_dcache_for_umount_subtree(dentry);
-	}
-}
-
 /*
  * This tries to ascend one level of parenthood, but
  * we can race with renaming, so we need to re-check
@@ -1478,6 +1368,91 @@ void shrink_dcache_parent(struct dentry *parent)
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry)
+{
+	struct select_data *data = _data;
+	enum d_walk_ret ret = D_WALK_CONTINUE;
+
+	if (dentry->d_lockref.count) {
+		dentry_lru_del(dentry);
+		if (likely(!list_empty(&dentry->d_subdirs)))
+			goto out;
+		if (dentry == data->start && dentry->d_lockref.count == 1)
+			goto out;
+		printk(KERN_ERR
+		       "BUG: Dentry %p{i=%lx,n=%s}"
+		       " still in use (%d)"
+		       " [unmount of %s %s]\n",
+		       dentry,
+		       dentry->d_inode ?
+		       dentry->d_inode->i_ino : 0UL,
+		       dentry->d_name.name,
+		       dentry->d_lockref.count,
+		       dentry->d_sb->s_type->name,
+		       dentry->d_sb->s_id);
+		BUG();
+	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+		/*
+		 * We can't use d_lru_shrink_move() because we
+		 * need to get the global LRU lock and do the
+		 * LRU accounting.
+		 */
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		d_shrink_add(dentry, &data->dispose);
+		data->found++;
+		ret = D_WALK_NORETRY;
+	}
+out:
+	if (data->found && need_resched())
+		ret = D_WALK_QUIT;
+	return ret;
+}
+
+/*
+ * destroy the dentries attached to a superblock on unmounting
+ */
+void shrink_dcache_for_umount(struct super_block *sb)
+{
+	struct dentry *dentry;
+
+	if (down_read_trylock(&sb->s_umount))
+		BUG();
+
+	dentry = sb->s_root;
+	sb->s_root = NULL;
+	for (;;) {
+		struct select_data data;
+
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = dentry;
+		data.found = 0;
+
+		d_walk(dentry, &data, umount_collect, NULL);
+		if (!data.found)
+			break;
+
+		shrink_dentry_list(&data.dispose);
+		cond_resched();
+	}
+	d_drop(dentry);
+	dput(dentry);
+
+	while (!hlist_bl_empty(&sb->s_anon)) {
+		struct select_data data;
+		dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
+
+		INIT_LIST_HEAD(&data.dispose);
+		data.start = NULL;
+		data.found = 0;
+
+		d_walk(dentry, &data, umount_collect, NULL);
+		if (data.found)
+			shrink_dentry_list(&data.dispose);
+		cond_resched();
+	}
+}
+
 static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
 {
 	struct select_data *data = _data;
-- 
cgit v1.3


From 48a066e72d970a3e225a9c18690d570c736fc455 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Sep 2013 22:06:07 -0400
Subject: RCU'd vfsmounts

* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock)
* sequence number from mount_lock is stored in nameidata->m_seq and
used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT.  Set by umount_tree() when its
caller knows that vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock()
and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq).  Checks the mount_lock sequence
number against seq, then grabs reference to mnt.  Then it rechecks mount_lock
again to close the race and either returns success or drops the reference it
has acquired.  The subtle point is that in case of MNT_SYNC_UMOUNT we can
simply decrement the refcount and sod off - aforementioned synchronize_rcu()
makes sure that final mntput() won't come until we leave RCU mode.  We need
that, since we don't want to end up with some lazy pathwalk racing with
umount() and stealing the final mntput() from it - caller of umount() may
expect it to return only once the fs is shut down and we don't want to break
that.  In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do
full-blown mntput() in case of mount_lock sequence number mismatch happening
just as we'd grabbed the reference, but in those cases we won't be stealing
the final mntput() from anything that would care.
* mntput_no_expire() doesn't lock anything on the fast path now.  Incidentally,
SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock.  It does,
of course, bump the refcounts of vfsmount and dentry in the very end, but that's
it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c           |  20 +++++---
 fs/mount.h            |  10 ++--
 fs/namei.c            |  50 ++++++++++---------
 fs/namespace.c        | 135 ++++++++++++++++++++++++++++++++------------------
 include/linux/mount.h |   2 +
 include/linux/namei.h |   2 +-
 6 files changed, 136 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index eb0978da1bd4..aafa2a146434 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2887,24 +2887,28 @@ static int prepend_path(const struct path *path,
 	struct vfsmount *vfsmnt = path->mnt;
 	struct mount *mnt = real_mount(vfsmnt);
 	int error = 0;
-	unsigned seq = 0;
+	unsigned seq, m_seq = 0;
 	char *bptr;
 	int blen;
 
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
+restart_mnt:
+	read_seqbegin_or_lock(&mount_lock, &m_seq);
+	seq = 0;
 restart:
 	bptr = *buffer;
 	blen = *buflen;
+	error = 0;
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
 		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+			struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
 			/* Global root? */
-			if (mnt_has_parent(mnt)) {
-				dentry = mnt->mnt_mountpoint;
-				mnt = mnt->mnt_parent;
+			if (mnt != parent) {
+				dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+				mnt = parent;
 				vfsmnt = &mnt->mnt;
 				continue;
 			}
@@ -2938,7 +2942,11 @@ restart:
 		goto restart;
 	}
 	done_seqretry(&rename_lock, seq);
-	br_read_unlock(&vfsmount_lock);
+	if (need_seqretry(&mount_lock, m_seq)) {
+		m_seq = 1;
+		goto restart_mnt;
+	}
+	done_seqretry(&mount_lock, m_seq);
 
 	if (error >= 0 && bptr == *buffer) {
 		if (--blen < 0)
diff --git a/fs/mount.h b/fs/mount.h
index f0866076de6e..d64c594be6c4 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,7 +1,6 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
-#include <linux/lglock.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -30,6 +29,7 @@ struct mount {
 	struct mount *mnt_parent;
 	struct dentry *mnt_mountpoint;
 	struct vfsmount mnt;
+	struct rcu_head mnt_rcu;
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
 #else
@@ -80,21 +80,23 @@ static inline int is_mounted(struct vfsmount *mnt)
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
+extern bool legitimize_mnt(struct vfsmount *, unsigned);
+
 static inline void get_mnt_ns(struct mnt_namespace *ns)
 {
 	atomic_inc(&ns->count);
 }
 
-extern struct lglock vfsmount_lock;
+extern seqlock_t mount_lock;
 
 static inline void lock_mount_hash(void)
 {
-	br_write_lock(&vfsmount_lock);
+	write_seqlock(&mount_lock);
 }
 
 static inline void unlock_mount_hash(void)
 {
-	br_write_unlock(&vfsmount_lock);
+	write_sequnlock(&mount_lock);
 }
 
 struct proc_mounts {
diff --git a/fs/namei.c b/fs/namei.c
index 1f844fbfce72..cb0ebae07e52 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -484,14 +484,12 @@ EXPORT_SYMBOL(path_put);
 
 static inline void lock_rcu_walk(void)
 {
-	br_read_lock(&vfsmount_lock);
 	rcu_read_lock();
 }
 
 static inline void unlock_rcu_walk(void)
 {
 	rcu_read_unlock();
-	br_read_unlock(&vfsmount_lock);
 }
 
 /**
@@ -512,26 +510,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	/*
-	 * Get a reference to the parent first: we're
-	 * going to make "path_put(nd->path)" valid in
-	 * non-RCU context for "terminate_walk()".
-	 *
-	 * If this doesn't work, return immediately with
-	 * RCU walking still active (and then we will do
-	 * the RCU walk cleanup in terminate_walk()).
+	 * After legitimizing the bastards, terminate_walk()
+	 * will do the right thing for non-RCU mode, and all our
+	 * subsequent exit cases should rcu_read_unlock()
+	 * before returning.  Do vfsmount first; if dentry
+	 * can't be legitimized, just set nd->path.dentry to NULL
+	 * and rely on dput(NULL) being a no-op.
 	 */
-	if (!lockref_get_not_dead(&parent->d_lockref))
+	if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
 		return -ECHILD;
-
-	/*
-	 * After the mntget(), we terminate_walk() will do
-	 * the right thing for non-RCU mode, and all our
-	 * subsequent exit cases should unlock_rcu_walk()
-	 * before returning.
-	 */
-	mntget(nd->path.mnt);
 	nd->flags &= ~LOOKUP_RCU;
 
+	if (!lockref_get_not_dead(&parent->d_lockref)) {
+		nd->path.dentry = NULL;	
+		unlock_rcu_walk();
+		return -ECHILD;
+	}
+
 	/*
 	 * For a negative lookup, the lookup sequence point is the parents
 	 * sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@ static int complete_walk(struct nameidata *nd)
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
 
+		if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+			unlock_rcu_walk();
+			return -ECHILD;
+		}
 		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
 			unlock_rcu_walk();
+			mntput(nd->path.mnt);
 			return -ECHILD;
 		}
 		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
 			unlock_rcu_walk();
 			dput(dentry);
+			mntput(nd->path.mnt);
 			return -ECHILD;
 		}
-		mntget(nd->path.mnt);
 		unlock_rcu_walk();
 	}
 
@@ -909,15 +909,15 @@ int follow_up(struct path *path)
 	struct mount *parent;
 	struct dentry *mountpoint;
 
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	parent = mnt->mnt_parent;
 	if (parent == mnt) {
-		br_read_unlock(&vfsmount_lock);
+		read_sequnlock_excl(&mount_lock);
 		return 0;
 	}
 	mntget(&parent->mnt);
 	mountpoint = dget(mnt->mnt_mountpoint);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
@@ -1048,8 +1048,8 @@ static int follow_managed(struct path *path, unsigned flags)
 
 			/* Something is mounted on this dentry in another
 			 * namespace and/or whatever was mounted there in this
-			 * namespace got unmounted before we managed to get the
-			 * vfsmount_lock */
+			 * namespace got unmounted before lookup_mnt() could
+			 * get it */
 		}
 
 		/* Handle an automount point */
@@ -1864,6 +1864,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
 			path_get(&nd->path);
 		}
@@ -1872,6 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
 	nd->root.mnt = NULL;
 
+	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
 			lock_rcu_walk();
diff --git a/fs/namespace.c b/fs/namespace.c
index 500202ce10db..ac2ce8a766e1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  * It should be taken for write in all cases where the vfsmount
  * tree or hash is modified or when a vfsmount structure is modified.
  */
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -547,16 +547,38 @@ static void free_vfsmnt(struct mount *mnt)
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+	struct mount *mnt;
+	if (read_seqretry(&mount_lock, seq))
+		return false;
+	if (bastard == NULL)
+		return true;
+	mnt = real_mount(bastard);
+	mnt_add_count(mnt, 1);
+	if (likely(!read_seqretry(&mount_lock, seq)))
+		return true;
+	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+		mnt_add_count(mnt, -1);
+		return false;
+	}
+	rcu_read_unlock();
+	mntput(bastard);
+	rcu_read_lock();
+	return false;
+}
+
 /*
  * find the first mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * call under rcu_read_lock()
  */
 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
 	struct mount *p;
 
-	list_for_each_entry(p, head, mnt_hash)
+	list_for_each_entry_rcu(p, head, mnt_hash)
 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 			return p;
 	return NULL;
@@ -564,7 +586,7 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 
 /*
  * find the last mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * mount_lock must be held.
  */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -596,17 +618,17 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct mount *child_mnt;
+	struct vfsmount *m;
+	unsigned seq;
 
-	br_read_lock(&vfsmount_lock);
-	child_mnt = __lookup_mnt(path->mnt, path->dentry);
-	if (child_mnt) {
-		mnt_add_count(child_mnt, 1);
-		br_read_unlock(&vfsmount_lock);
-		return &child_mnt->mnt;
-	} else {
-		br_read_unlock(&vfsmount_lock);
-		return NULL;
-	}
+	rcu_read_lock();
+	do {
+		seq = read_seqbegin(&mount_lock);
+		child_mnt = __lookup_mnt(path->mnt, path->dentry);
+		m = child_mnt ? &child_mnt->mnt : NULL;
+	} while (!legitimize_mnt(m, seq));
+	rcu_read_unlock();
+	return m;
 }
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -874,38 +896,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return ERR_PTR(err);
 }
 
+static void delayed_free(struct rcu_head *head)
+{
+	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
+	kfree(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_pcp);
+#endif
+	kmem_cache_free(mnt_cache, mnt);
+}
+
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
-#ifdef CONFIG_SMP
-	br_read_lock(&vfsmount_lock);
-	if (likely(mnt->mnt_ns)) {
-		/* shouldn't be the last one */
-		mnt_add_count(mnt, -1);
-		br_read_unlock(&vfsmount_lock);
+	rcu_read_lock();
+	mnt_add_count(mnt, -1);
+	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+		rcu_read_unlock();
 		return;
 	}
-	br_read_unlock(&vfsmount_lock);
-
 	lock_mount_hash();
-	mnt_add_count(mnt, -1);
 	if (mnt_get_count(mnt)) {
+		rcu_read_unlock();
 		unlock_mount_hash();
 		return;
 	}
-#else
-	mnt_add_count(mnt, -1);
-	if (likely(mnt_get_count(mnt)))
-		return;
-	lock_mount_hash();
-#endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
+		rcu_read_unlock();
 		unlock_mount_hash();
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+		rcu_read_unlock();
+		unlock_mount_hash();
+		return;
+	}
+	mnt->mnt.mnt_flags |= MNT_DOOMED;
+	rcu_read_unlock();
 
 	list_del(&mnt->mnt_instance);
 	unlock_mount_hash();
@@ -924,7 +954,8 @@ put_again:
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
-	free_vfsmnt(mnt);
+	mnt_free_id(mnt);
+	call_rcu(&mnt->mnt_rcu, delayed_free);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1137,6 +1168,8 @@ static void namespace_unlock(void)
 	list_splice_init(&unmounted, &head);
 	up_write(&namespace_sem);
 
+	synchronize_rcu();
+
 	while (!list_empty(&head)) {
 		mnt = list_first_entry(&head, struct mount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
@@ -1152,10 +1185,13 @@ static inline void namespace_lock(void)
 }
 
 /*
- * vfsmount lock must be held for write
+ * mount_lock must be held
  * namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
  */
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
 {
 	LIST_HEAD(tmp_list);
 	struct mount *p;
@@ -1163,7 +1199,7 @@ void umount_tree(struct mount *mnt, int propagate)
 	for (p = mnt; p; p = next_mnt(p, mnt))
 		list_move(&p->mnt_hash, &tmp_list);
 
-	if (propagate)
+	if (how)
 		propagate_umount(&tmp_list);
 
 	list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1171,6 +1207,8 @@ void umount_tree(struct mount *mnt, int propagate)
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
+		if (how < 2)
+			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
 		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
 			put_mountpoint(p->mnt_mp);
@@ -1262,14 +1300,18 @@ static int do_umount(struct mount *mnt, int flags)
 	lock_mount_hash();
 	event++;
 
-	if (!(flags & MNT_DETACH))
-		shrink_submounts(mnt);
-
-	retval = -EBUSY;
-	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+	if (flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, 1);
+			umount_tree(mnt, 2);
 		retval = 0;
+	} else {
+		shrink_submounts(mnt);
+		retval = -EBUSY;
+		if (!propagate_mount_busy(mnt, 2)) {
+			if (!list_empty(&mnt->mnt_list))
+				umount_tree(mnt, 1);
+			retval = 0;
+		}
 	}
 	unlock_mount_hash();
 	namespace_unlock();
@@ -1955,7 +1997,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
@@ -2172,7 +2214,7 @@ resume:
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
  *
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
  */
 static void shrink_submounts(struct mount *mnt)
 {
@@ -2558,7 +2600,7 @@ out_type:
 /*
  * Return true if path is reachable from root
  *
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
  */
 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 			 const struct path *root)
@@ -2573,9 +2615,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2748,8 +2790,6 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
-	br_lock_init(&vfsmount_lock);
-
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2788,9 +2828,8 @@ void kern_unmount(struct vfsmount *mnt)
 {
 	/* release long term mount so mount point can be released */
 	if (!IS_ERR_OR_NULL(mnt)) {
-		lock_mount_hash();
 		real_mount(mnt)->mnt_ns = NULL;
-		unlock_mount_hash();
+		synchronize_rcu();	/* yecchhh... */
 		mntput(mnt);
 	}
 }
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 38cd98f112a0..371d346fa270 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -49,6 +49,8 @@ struct mnt_namespace;
 
 #define MNT_LOCK_READONLY	0x400000
 #define MNT_LOCKED		0x800000
+#define MNT_DOOMED		0x1000000
+#define MNT_SYNC_UMOUNT		0x2000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 8e47bc7a1665..492de72560fa 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -16,7 +16,7 @@ struct nameidata {
 	struct path	root;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
-	unsigned	seq;
+	unsigned	seq, m_seq;
 	int		last_type;
 	unsigned	depth;
 	char *saved_names[MAX_NESTED_LINKS + 1];
-- 
cgit v1.3


From 8b61e74ffc6310e1d35a9b51c8463093851f8bcf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Nov 2013 12:45:01 -0500
Subject: get rid of {lock,unlock}_rcu_walk()

those have become aliases for rcu_read_{lock,unlock}()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 38 ++++++++++++++------------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index cb0ebae07e52..e5c0118ba9f8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -482,16 +482,6 @@ EXPORT_SYMBOL(path_put);
  * to restart the path walk from the beginning in ref-walk mode.
  */
 
-static inline void lock_rcu_walk(void)
-{
-	rcu_read_lock();
-}
-
-static inline void unlock_rcu_walk(void)
-{
-	rcu_read_unlock();
-}
-
 /**
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
@@ -523,7 +513,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 
 	if (!lockref_get_not_dead(&parent->d_lockref)) {
 		nd->path.dentry = NULL;	
-		unlock_rcu_walk();
+		rcu_read_unlock();
 		return -ECHILD;
 	}
 
@@ -561,17 +551,17 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 		spin_unlock(&fs->lock);
 	}
 
-	unlock_rcu_walk();
+	rcu_read_unlock();
 	return 0;
 
 unlock_and_drop_dentry:
 	spin_unlock(&fs->lock);
 drop_dentry:
-	unlock_rcu_walk();
+	rcu_read_unlock();
 	dput(dentry);
 	goto drop_root_mnt;
 out:
-	unlock_rcu_walk();
+	rcu_read_unlock();
 drop_root_mnt:
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
@@ -604,21 +594,21 @@ static int complete_walk(struct nameidata *nd)
 			nd->root.mnt = NULL;
 
 		if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
-			unlock_rcu_walk();
+			rcu_read_unlock();
 			return -ECHILD;
 		}
 		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
-			unlock_rcu_walk();
+			rcu_read_unlock();
 			mntput(nd->path.mnt);
 			return -ECHILD;
 		}
 		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
-			unlock_rcu_walk();
+			rcu_read_unlock();
 			dput(dentry);
 			mntput(nd->path.mnt);
 			return -ECHILD;
 		}
-		unlock_rcu_walk();
+		rcu_read_unlock();
 	}
 
 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -1174,7 +1164,7 @@ failed:
 	nd->flags &= ~LOOKUP_RCU;
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
-	unlock_rcu_walk();
+	rcu_read_unlock();
 	return -ECHILD;
 }
 
@@ -1501,7 +1491,7 @@ static void terminate_walk(struct nameidata *nd)
 		nd->flags &= ~LOOKUP_RCU;
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
-		unlock_rcu_walk();
+		rcu_read_unlock();
 	}
 }
 
@@ -1862,7 +1852,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
-			lock_rcu_walk();
+			rcu_read_lock();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
@@ -1876,7 +1866,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
-			lock_rcu_walk();
+			rcu_read_lock();
 			set_root_rcu(nd);
 		} else {
 			set_root(nd);
@@ -1888,7 +1878,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 			struct fs_struct *fs = current->fs;
 			unsigned seq;
 
-			lock_rcu_walk();
+			rcu_read_lock();
 
 			do {
 				seq = read_seqcount_begin(&fs->seq);
@@ -1920,7 +1910,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 			if (f.need_put)
 				*fp = f.file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			lock_rcu_walk();
+			rcu_read_lock();
 		} else {
 			path_get(&nd->path);
 			fdput(f);
-- 
cgit v1.3


From eee5cc2702929fd41cce28058dc6d6717f723f87 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Oct 2013 11:06:42 -0400
Subject: get rid of s_files and files_lock

The only thing we need it for is alt-sysrq-r (emergency remount r/o)
and these days we can do just as well without going through the
list of files.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 125 -----------------------------------------------------
 fs/internal.h      |   3 --
 fs/open.c          |   2 -
 fs/super.c         |  15 +------
 include/linux/fs.h |  13 ------
 5 files changed, 2 insertions(+), 156 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index e61e5529fa9d..23b6dca03ba0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,8 +36,6 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-DEFINE_STATIC_LGLOCK(files_lglock);
-
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
 
@@ -134,7 +132,6 @@ struct file *get_empty_filp(void)
 		return ERR_PTR(error);
 	}
 
-	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
@@ -304,7 +301,6 @@ void fput(struct file *file)
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		struct task_struct *task = current;
 
-		file_sb_list_del(file);
 		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
 			init_task_work(&file->f_u.fu_rcuhead, ____fput);
 			if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
@@ -333,7 +329,6 @@ void __fput_sync(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		struct task_struct *task = current;
-		file_sb_list_del(file);
 		BUG_ON(!(task->flags & PF_KTHREAD));
 		__fput(file);
 	}
@@ -345,129 +340,10 @@ void put_filp(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		security_file_free(file);
-		file_sb_list_del(file);
 		file_free(file);
 	}
 }
 
-static inline int file_list_cpu(struct file *file)
-{
-#ifdef CONFIG_SMP
-	return file->f_sb_list_cpu;
-#else
-	return smp_processor_id();
-#endif
-}
-
-/* helper for file_sb_list_add to reduce ifdefs */
-static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
-{
-	struct list_head *list;
-#ifdef CONFIG_SMP
-	int cpu;
-	cpu = smp_processor_id();
-	file->f_sb_list_cpu = cpu;
-	list = per_cpu_ptr(sb->s_files, cpu);
-#else
-	list = &sb->s_files;
-#endif
-	list_add(&file->f_u.fu_list, list);
-}
-
-/**
- * file_sb_list_add - add a file to the sb's file list
- * @file: file to add
- * @sb: sb to add it to
- *
- * Use this function to associate a file with the superblock of the inode it
- * refers to.
- */
-void file_sb_list_add(struct file *file, struct super_block *sb)
-{
-	if (likely(!(file->f_mode & FMODE_WRITE)))
-		return;
-	if (!S_ISREG(file_inode(file)->i_mode))
-		return;
-	lg_local_lock(&files_lglock);
-	__file_sb_list_add(file, sb);
-	lg_local_unlock(&files_lglock);
-}
-
-/**
- * file_sb_list_del - remove a file from the sb's file list
- * @file: file to remove
- * @sb: sb to remove it from
- *
- * Use this function to remove a file from its superblock.
- */
-void file_sb_list_del(struct file *file)
-{
-	if (!list_empty(&file->f_u.fu_list)) {
-		lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
-		list_del_init(&file->f_u.fu_list);
-		lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
-	}
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * These macros iterate all files on all CPUs for a given superblock.
- * files_lglock must be held globally.
- */
-#define do_file_list_for_each_entry(__sb, __file)		\
-{								\
-	int i;							\
-	for_each_possible_cpu(i) {				\
-		struct list_head *list;				\
-		list = per_cpu_ptr((__sb)->s_files, i);		\
-		list_for_each_entry((__file), list, f_u.fu_list)
-
-#define while_file_list_for_each_entry				\
-	}							\
-}
-
-#else
-
-#define do_file_list_for_each_entry(__sb, __file)		\
-{								\
-	struct list_head *list;					\
-	list = &(sb)->s_files;					\
-	list_for_each_entry((__file), list, f_u.fu_list)
-
-#define while_file_list_for_each_entry				\
-}
-
-#endif
-
-/**
- *	mark_files_ro - mark all files read-only
- *	@sb: superblock in question
- *
- *	All files are marked read-only.  We don't care about pending
- *	delete files so this should be used in 'force' mode only.
- */
-void mark_files_ro(struct super_block *sb)
-{
-	struct file *f;
-
-	lg_global_lock(&files_lglock);
-	do_file_list_for_each_entry(sb, f) {
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		spin_lock(&f->f_lock);
-		f->f_mode &= ~FMODE_WRITE;
-		spin_unlock(&f->f_lock);
-		if (file_check_writeable(f) != 0)
-			continue;
-		__mnt_drop_write(f->f_path.mnt);
-		file_release_write(f);
-	} while_file_list_for_each_entry;
-	lg_global_unlock(&files_lglock);
-}
-
 void __init files_init(unsigned long mempages)
 { 
 	unsigned long n;
@@ -483,6 +359,5 @@ void __init files_init(unsigned long mempages)
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
-	lg_lock_init(&files_lglock, "files_lglock");
 	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/internal.h b/fs/internal.h
index 4a11e75ce14d..465742407466 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,9 +73,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
 /*
  * file_table.c
  */
-extern void file_sb_list_add(struct file *f, struct super_block *sb);
-extern void file_sb_list_del(struct file *f);
-extern void mark_files_ro(struct super_block *);
 extern struct file *get_empty_filp(void);
 
 /*
diff --git a/fs/open.c b/fs/open.c
index a1465b1ec8c7..fffbed40dbe9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -685,7 +685,6 @@ static int do_dentry_open(struct file *f,
 	}
 
 	f->f_mapping = inode->i_mapping;
-	file_sb_list_add(f, inode->i_sb);
 
 	if (unlikely(f->f_mode & FMODE_PATH)) {
 		f->f_op = &empty_fops;
@@ -724,7 +723,6 @@ static int do_dentry_open(struct file *f,
 
 cleanup_all:
 	fops_put(f->f_op);
-	file_sb_list_del(f);
 	if (f->f_mode & FMODE_WRITE) {
 		put_write_access(inode);
 		if (!special_file(inode->i_mode)) {
diff --git a/fs/super.c b/fs/super.c
index 743bb7118053..e5f6c2cfac38 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -140,9 +140,6 @@ static void destroy_super(struct super_block *s)
 	int i;
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
-#ifdef CONFIG_SMP
-	free_percpu(s->s_files);
-#endif
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_counter_destroy(&s->s_writers.counter[i]);
 	security_sb_free(s);
@@ -172,15 +169,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	if (security_sb_alloc(s))
 		goto fail;
 
-#ifdef CONFIG_SMP
-	s->s_files = alloc_percpu(struct list_head);
-	if (!s->s_files)
-		goto fail;
-	for_each_possible_cpu(i)
-		INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
-#else
-	INIT_LIST_HEAD(&s->s_files);
-#endif
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
 		if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
 			goto fail;
@@ -722,7 +710,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	   make sure there are no rw files opened */
 	if (remount_ro) {
 		if (force) {
-			mark_files_ro(sb);
+			sb->s_readonly_remount = 1;
+			smp_wmb();
 		} else {
 			retval = sb_prepare_remount_readonly(sb);
 			if (retval)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ab8a67ee054..2b0f4e974480 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -764,12 +764,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 #define FILE_MNT_WRITE_RELEASED	2
 
 struct file {
-	/*
-	 * fu_list becomes invalid after file_free is called and queued via
-	 * fu_rcuhead for RCU freeing
-	 */
 	union {
-		struct list_head	fu_list;
 		struct llist_node	fu_llist;
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
@@ -783,9 +778,6 @@ struct file {
 	 * Must not be taken from IRQ context.
 	 */
 	spinlock_t		f_lock;
-#ifdef CONFIG_SMP
-	int			f_sb_list_cpu;
-#endif
 	atomic_long_t		f_count;
 	unsigned int 		f_flags;
 	fmode_t			f_mode;
@@ -1264,11 +1256,6 @@ struct super_block {
 
 	struct list_head	s_inodes;	/* all inodes */
 	struct hlist_bl_head	s_anon;		/* anonymous dentries for (nfs) exporting */
-#ifdef CONFIG_SMP
-	struct list_head __percpu *s_files;
-#else
-	struct list_head	s_files;
-#endif
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
 	struct backing_dev_info *s_bdi;
-- 
cgit v1.3


From b61625d24596ea44555943867d5a5c1efd81074c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Oct 2013 11:09:01 -0400
Subject: fold __d_shrink() into its only remaining caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index aafa2a146434..fb7bcf3ba5d6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -483,27 +483,6 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	return parent;
 }
 
-/*
- * Unhash a dentry without inserting an RCU walk barrier or checking that
- * dentry->d_lock is locked.  The caller must take care of that, if
- * appropriate.
- */
-static void __d_shrink(struct dentry *dentry)
-{
-	if (!d_unhashed(dentry)) {
-		struct hlist_bl_head *b;
-		if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
-			b = &dentry->d_sb->s_anon;
-		else
-			b = d_hash(dentry->d_parent, dentry->d_name.hash);
-
-		hlist_bl_lock(b);
-		__hlist_bl_del(&dentry->d_hash);
-		dentry->d_hash.pprev = NULL;
-		hlist_bl_unlock(b);
-	}
-}
-
 /**
  * d_drop - drop a dentry
  * @dentry: dentry to drop
@@ -522,7 +501,16 @@ static void __d_shrink(struct dentry *dentry)
 void __d_drop(struct dentry *dentry)
 {
 	if (!d_unhashed(dentry)) {
-		__d_shrink(dentry);
+		struct hlist_bl_head *b;
+		if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+			b = &dentry->d_sb->s_anon;
+		else
+			b = d_hash(dentry->d_parent, dentry->d_name.hash);
+
+		hlist_bl_lock(b);
+		__hlist_bl_del(&dentry->d_hash);
+		dentry->d_hash.pprev = NULL;
+		hlist_bl_unlock(b);
 		dentry_rcuwalk_barrier(dentry);
 	}
 }
-- 
cgit v1.3


From 11d100d9a291508be5a5bae1ddf92ea2ba9b8f0c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Oct 2013 18:17:02 -0400
Subject: coda_revalidate_inode(): switch to passing inode...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/coda_linux.h | 2 +-
 fs/coda/dir.c        | 3 +--
 fs/coda/inode.c      | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index cc0ea9fe5ecf..e7550cb9fb74 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -40,7 +40,7 @@ extern const struct file_operations coda_ioctl_operations;
 int coda_open(struct inode *i, struct file *f);
 int coda_release(struct inode *i, struct file *f);
 int coda_permission(struct inode *inode, int mask);
-int coda_revalidate_inode(struct dentry *);
+int coda_revalidate_inode(struct inode *);
 int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int coda_setattr(struct dentry *, struct iattr *);
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 8c8d469c9283..5efbb5ee0adc 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -563,13 +563,12 @@ static int coda_dentry_delete(const struct dentry * dentry)
  * cache manager Venus issues a downcall to the kernel when this 
  * happens 
  */
-int coda_revalidate_inode(struct dentry *dentry)
+int coda_revalidate_inode(struct inode *inode)
 {
 	struct coda_vattr attr;
 	int error;
 	int old_mode;
 	ino_t old_ino;
-	struct inode *inode = dentry->d_inode;
 	struct coda_inode_info *cii = ITOC(inode);
 
 	if (!cii->c_flags)
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 4dcc0d81a7aa..506de34a4ef3 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -257,7 +257,7 @@ static void coda_evict_inode(struct inode *inode)
 
 int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
-	int err = coda_revalidate_inode(dentry);
+	int err = coda_revalidate_inode(dentry->d_inode);
 	if (!err)
 		generic_fillattr(dentry->d_inode, stat);
 	return err;
-- 
cgit v1.3


From ecc8c7725e6c21528329b34acae2a1d64b3af89b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Oct 2013 15:32:35 -0400
Subject: new helper: dump_emit()

dump_write() analog, takes core_dump_params instead of file,
keeps track of the amount written in cprm->written and checks for
cprm->limit.  Start using it in binfmt_elf.c...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c          | 60 +++++++++++++++++++-----------------------------
 fs/coredump.c            | 14 +++++++++++
 include/linux/binfmts.h  |  1 +
 include/linux/coredump.h |  2 ++
 4 files changed, 40 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 501c8a4d6eb1..00fd9c969a27 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1225,35 +1225,23 @@ static int notesize(struct memelfnote *en)
 	return sz;
 }
 
-#define DUMP_WRITE(addr, nr, foffset)	\
-	do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
-
-static int alignfile(struct file *file, loff_t *foffset)
+static int alignfile(struct coredump_params *cprm)
 {
 	static const char buf[4] = { 0, };
-	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
-	return 1;
+	return dump_emit(cprm, buf, roundup(cprm->written, 4) - cprm->written);
 }
 
-static int writenote(struct memelfnote *men, struct file *file,
-			loff_t *foffset)
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
 {
 	struct elf_note en;
 	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
-	DUMP_WRITE(&en, sizeof(en), foffset);
-	DUMP_WRITE(men->name, en.n_namesz, foffset);
-	if (!alignfile(file, foffset))
-		return 0;
-	DUMP_WRITE(men->data, men->datasz, foffset);
-	if (!alignfile(file, foffset))
-		return 0;
-
-	return 1;
+	return dump_emit(cprm, &en, sizeof(en)) &&
+	    dump_emit(cprm, men->name, en.n_namesz) && alignfile(cprm) &&
+	    dump_emit(cprm, men->data, men->datasz) && alignfile(cprm);
 }
-#undef DUMP_WRITE
 
 static void fill_elf_header(struct elfhdr *elf, int segs,
 			    u16 machine, u32 flags)
@@ -1702,7 +1690,7 @@ static size_t get_note_info_size(struct elf_note_info *info)
  * process-wide notes are interleaved after the first thread-specific note.
  */
 static int write_note_info(struct elf_note_info *info,
-			   struct file *file, loff_t *foffset)
+			   struct coredump_params *cprm)
 {
 	bool first = 1;
 	struct elf_thread_core_info *t = info->thread;
@@ -1710,22 +1698,22 @@ static int write_note_info(struct elf_note_info *info,
 	do {
 		int i;
 
-		if (!writenote(&t->notes[0], file, foffset))
+		if (!writenote(&t->notes[0], cprm))
 			return 0;
 
-		if (first && !writenote(&info->psinfo, file, foffset))
+		if (first && !writenote(&info->psinfo, cprm))
 			return 0;
-		if (first && !writenote(&info->signote, file, foffset))
+		if (first && !writenote(&info->signote, cprm))
 			return 0;
-		if (first && !writenote(&info->auxv, file, foffset))
+		if (first && !writenote(&info->auxv, cprm))
 			return 0;
 		if (first && info->files.data &&
-				!writenote(&info->files, file, foffset))
+				!writenote(&info->files, cprm))
 			return 0;
 
 		for (i = 1; i < info->thread_notes; ++i)
 			if (t->notes[i].data &&
-			    !writenote(&t->notes[i], file, foffset))
+			    !writenote(&t->notes[i], cprm))
 				return 0;
 
 		first = 0;
@@ -1935,13 +1923,13 @@ static size_t get_note_info_size(struct elf_note_info *info)
 }
 
 static int write_note_info(struct elf_note_info *info,
-			   struct file *file, loff_t *foffset)
+			   struct coredump_params *cprm)
 {
 	int i;
 	struct list_head *t;
 
 	for (i = 0; i < info->numnote; i++)
-		if (!writenote(info->notes + i, file, foffset))
+		if (!writenote(info->notes + i, cprm))
 			return 0;
 
 	/* write out the thread status notes section */
@@ -1950,7 +1938,7 @@ static int write_note_info(struct elf_note_info *info,
 				list_entry(t, struct elf_thread_status, list);
 
 		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], file, foffset))
+			if (!writenote(&tmp->notes[i], cprm))
 				return 0;
 	}
 
@@ -2136,13 +2124,10 @@ static int elf_core_dump(struct coredump_params *cprm)
 
 	offset = dataoff;
 
-	size += sizeof(*elf);
-	if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+	if (!dump_emit(cprm, elf, sizeof(*elf)))
 		goto end_coredump;
 
-	size += sizeof(*phdr4note);
-	if (size > cprm->limit
-	    || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+	if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
 		goto end_coredump;
 
 	/* Write program headers for segments dump */
@@ -2164,19 +2149,20 @@ static int elf_core_dump(struct coredump_params *cprm)
 			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
-		size += sizeof(phdr);
-		if (size > cprm->limit
-		    || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
 			goto end_coredump;
 	}
+	size = cprm->written;
 
 	if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
 		goto end_coredump;
 
+	cprm->written = foffset;	/* will disappear */
  	/* write out the notes section */
-	if (!write_note_info(&info, cprm->file, &foffset))
+	if (!write_note_info(&info, cprm))
 		goto end_coredump;
 
+	foffset = cprm->written;
 	if (elf_coredump_extra_notes_write(cprm->file, &foffset))
 		goto end_coredump;
 
diff --git a/fs/coredump.c b/fs/coredump.c
index 42c3b8423669..319f973bab72 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -693,6 +693,20 @@ int dump_write(struct file *file, const void *addr, int nr)
 }
 EXPORT_SYMBOL(dump_write);
 
+int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
+{
+	struct file *file = cprm->file;
+	if (dump_interrupted() || !access_ok(VERIFY_READ, addr, nr))
+		return 0;
+	if (cprm->written + nr > cprm->limit)
+		return 0;
+	if (file->f_op->write(file, addr, nr, &file->f_pos) != nr)
+		return 0;
+	cprm->written += nr;
+	return 1;
+}
+EXPORT_SYMBOL(dump_emit);
+
 int dump_seek(struct file *file, loff_t off)
 {
 	int ret = 1;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index e8112ae50531..8aa507e7a41a 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -61,6 +61,7 @@ struct coredump_params {
 	struct file *file;
 	unsigned long limit;
 	unsigned long mm_flags;
+	loff_t written;
 };
 
 /*
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index a98f1ca60407..2959376a9ad5 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -10,8 +10,10 @@
  * These are the only things you should do on a core-file: use only these
  * functions to write out all the necessary info.
  */
+struct coredump_params;
 extern int dump_write(struct file *file, const void *addr, int nr);
 extern int dump_seek(struct file *file, loff_t off);
+extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 #ifdef CONFIG_COREDUMP
 extern void do_coredump(siginfo_t *siginfo);
 #else
-- 
cgit v1.3


From 506f21c556c747bb07b893f146220ec45cda381b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Oct 2013 17:22:57 -0400
Subject: switch elf_core_write_extra_phdrs() to dump_emit()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/elfcore.c |  6 ++----
 arch/x86/um/elfcore.c      |  7 ++-----
 fs/binfmt_elf.c            |  4 ++--
 fs/binfmt_elf_fdpic.c      |  4 +++-
 include/linux/elfcore.h    |  5 +++--
 kernel/elfcore.c           | 10 +++-------
 6 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c
index bac1639bc320..798ce543da4b 100644
--- a/arch/ia64/kernel/elfcore.c
+++ b/arch/ia64/kernel/elfcore.c
@@ -11,8 +11,7 @@ Elf64_Half elf_core_extra_phdrs(void)
 	return GATE_EHDR->e_phnum;
 }
 
-int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
-			       unsigned long limit)
+int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 {
 	const struct elf_phdr *const gate_phdrs =
 		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
@@ -35,8 +34,7 @@ int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
 			phdr.p_offset += ofs;
 		}
 		phdr.p_paddr = 0; /* match other core phdrs */
-		*size += sizeof(phdr);
-		if (*size > limit || !dump_write(file, &phdr, sizeof(phdr)))
+		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
 			return 0;
 	}
 	return 1;
diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c
index 6bb49b687c97..fc21f98efafe 100644
--- a/arch/x86/um/elfcore.c
+++ b/arch/x86/um/elfcore.c
@@ -11,8 +11,7 @@ Elf32_Half elf_core_extra_phdrs(void)
 	return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0;
 }
 
-int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
-			       unsigned long limit)
+int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 {
 	if ( vsyscall_ehdr ) {
 		const struct elfhdr *const ehdrp =
@@ -32,9 +31,7 @@ int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
 				phdr.p_offset += ofs;
 			}
 			phdr.p_paddr = 0; /* match other core phdrs */
-			*size += sizeof(phdr);
-			if (*size > limit
-			    || !dump_write(file, &phdr, sizeof(phdr)))
+			if (!dump_emit(cprm, &phdr, sizeof(phdr)))
 				return 0;
 		}
 	}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 00fd9c969a27..35c4886dae2d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2152,11 +2152,11 @@ static int elf_core_dump(struct coredump_params *cprm)
 		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
 			goto end_coredump;
 	}
-	size = cprm->written;
 
-	if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+	if (!elf_core_write_extra_phdrs(cprm, offset))
 		goto end_coredump;
 
+	size = cprm->written;
 	cprm->written = foffset;	/* will disappear */
  	/* write out the notes section */
 	if (!write_note_info(&info, cprm))
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index ea4c6273b4a5..44db8b92121a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1791,9 +1791,11 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 			goto end_coredump;
 	}
 
-	if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+	cprm->written = size;
+	if (!elf_core_write_extra_phdrs(cprm, offset))
 		goto end_coredump;
 
+	size = cprm->written;
  	/* write out the notes section */
 	for (i = 0; i < numnote; i++)
 		if (!writenote(notes + i, cprm->file, &foffset))
diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index cdd3d13efce7..1b92a8c40624 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -6,6 +6,8 @@
 #include <asm/elf.h>
 #include <uapi/linux/elfcore.h>
 
+struct coredump_params;
+
 static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
 {
 #ifdef ELF_CORE_COPY_REGS
@@ -63,8 +65,7 @@ static inline int elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregse
  */
 extern Elf_Half elf_core_extra_phdrs(void);
 extern int
-elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
-			   unsigned long limit);
+elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset);
 extern int
 elf_core_write_extra_data(struct file *file, size_t *size, unsigned long limit);
 extern size_t elf_core_extra_data_size(void);
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915efef66d..e556751d15d9 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@
 #include <linux/elf.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
-
-#include <asm/elf.h>
-
+#include <linux/binfmts.h>
 
 Elf_Half __weak elf_core_extra_phdrs(void)
 {
 	return 0;
 }
 
-int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
-				      unsigned long limit)
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 {
 	return 1;
 }
 
-int __weak elf_core_write_extra_data(struct file *file, size_t *size,
-				     unsigned long limit)
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
 {
 	return 1;
 }
-- 
cgit v1.3


From aa3e7eaf0a0f06edd2b733e84e7e8ffe108e8786 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Oct 2013 17:50:15 -0400
Subject: switch elf_core_write_extra_data() to dump_emit()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/elfcore.c | 6 ++----
 arch/x86/um/elfcore.c      | 8 ++------
 fs/binfmt_elf.c            | 4 +++-
 fs/binfmt_elf_fdpic.c      | 4 +++-
 include/linux/elfcore.h    | 2 +-
 5 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c
index 798ce543da4b..04bc8fd5f893 100644
--- a/arch/ia64/kernel/elfcore.c
+++ b/arch/ia64/kernel/elfcore.c
@@ -40,8 +40,7 @@ int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 	return 1;
 }
 
-int elf_core_write_extra_data(struct file *file, size_t *size,
-			      unsigned long limit)
+int elf_core_write_extra_data(struct coredump_params *cprm)
 {
 	const struct elf_phdr *const gate_phdrs =
 		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
@@ -52,8 +51,7 @@ int elf_core_write_extra_data(struct file *file, size_t *size,
 			void *addr = (void *)gate_phdrs[i].p_vaddr;
 			size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz);
 
-			*size += memsz;
-			if (*size > limit || !dump_write(file, addr, memsz))
+			if (!dump_emit(cprm, addr, memsz))
 				return 0;
 			break;
 		}
diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c
index fc21f98efafe..7bb89a27a5e4 100644
--- a/arch/x86/um/elfcore.c
+++ b/arch/x86/um/elfcore.c
@@ -38,8 +38,7 @@ int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 	return 1;
 }
 
-int elf_core_write_extra_data(struct file *file, size_t *size,
-			      unsigned long limit)
+int elf_core_write_extra_data(struct coredump_params *cprm)
 {
 	if ( vsyscall_ehdr ) {
 		const struct elfhdr *const ehdrp =
@@ -52,10 +51,7 @@ int elf_core_write_extra_data(struct file *file, size_t *size,
 			if (phdrp[i].p_type == PT_LOAD) {
 				void *addr = (void *) phdrp[i].p_vaddr;
 				size_t filesz = phdrp[i].p_filesz;
-
-				*size += filesz;
-				if (*size > limit
-				    || !dump_write(file, addr, filesz))
+				if (!dump_emit(cprm, addr, filesz))
 					return 0;
 			}
 		}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 35c4886dae2d..a998b221b53a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2196,8 +2196,10 @@ static int elf_core_dump(struct coredump_params *cprm)
 		}
 	}
 
-	if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+	cprm->written = size;
+	if (!elf_core_write_extra_data(cprm))
 		goto end_coredump;
+	size = cprm->written;
 
 	if (e_phnum == PN_XNUM) {
 		size += sizeof(*shdr4extnum);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 44db8b92121a..77bf7e33e706 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1818,8 +1818,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 				    cprm->mm_flags) < 0)
 		goto end_coredump;
 
-	if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+	cprm->written = size;
+	if (!elf_core_write_extra_data(cprm))
 		goto end_coredump;
+	size = cprm->written;
 
 	if (e_phnum == PN_XNUM) {
 		size += sizeof(*shdr4extnum);
diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 1b92a8c40624..698d51a0eea3 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -67,7 +67,7 @@ extern Elf_Half elf_core_extra_phdrs(void);
 extern int
 elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset);
 extern int
-elf_core_write_extra_data(struct file *file, size_t *size, unsigned long limit);
+elf_core_write_extra_data(struct coredump_params *cprm);
 extern size_t elf_core_extra_data_size(void);
 
 #endif /* _LINUX_ELFCORE_H */
-- 
cgit v1.3


From 13046ece9625f96fafb0cff0b9b95a586b53b553 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Oct 2013 18:08:47 -0400
Subject: binfmt_elf: convert writing actual dump pages to dump_emit()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a998b221b53a..ddab40d3d828 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2093,7 +2093,6 @@ static int elf_core_dump(struct coredump_params *cprm)
 
 	offset += sizeof(*elf);				/* Elf header */
 	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
-	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
@@ -2157,7 +2156,6 @@ static int elf_core_dump(struct coredump_params *cprm)
 		goto end_coredump;
 
 	size = cprm->written;
-	cprm->written = foffset;	/* will disappear */
  	/* write out the notes section */
 	if (!write_note_info(&info, cprm))
 		goto end_coredump;
@@ -2170,6 +2168,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 	if (!dump_seek(cprm->file, dataoff - foffset))
 		goto end_coredump;
 
+	cprm->written = size;
 	for (vma = first_vma(current, gate_vma); vma != NULL;
 			vma = next_vma(vma, gate_vma)) {
 		unsigned long addr;
@@ -2184,9 +2183,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 			page = get_dump_page(addr);
 			if (page) {
 				void *kaddr = kmap(page);
-				stop = ((size += PAGE_SIZE) > cprm->limit) ||
-					!dump_write(cprm->file, kaddr,
-						    PAGE_SIZE);
+				stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
 				kunmap(page);
 				page_cache_release(page);
 			} else
@@ -2196,16 +2193,11 @@ static int elf_core_dump(struct coredump_params *cprm)
 		}
 	}
 
-	cprm->written = size;
 	if (!elf_core_write_extra_data(cprm))
 		goto end_coredump;
-	size = cprm->written;
 
 	if (e_phnum == PN_XNUM) {
-		size += sizeof(*shdr4extnum);
-		if (size > cprm->limit
-		    || !dump_write(cprm->file, shdr4extnum,
-				   sizeof(*shdr4extnum)))
+		if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
 			goto end_coredump;
 	}
 
-- 
cgit v1.3


From e6c1baa9b562ca296d57178c44f3894795d13d32 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Oct 2013 18:58:47 -0400
Subject: convert the rest of binfmt_elf_fdpic to dump_emit()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf_fdpic.c | 110 ++++++++++++++------------------------------------
 1 file changed, 31 insertions(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 77bf7e33e706..70e299917898 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1267,35 +1267,23 @@ static int notesize(struct memelfnote *en)
 
 /* #define DEBUG */
 
-#define DUMP_WRITE(addr, nr, foffset)	\
-	do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
-
-static int alignfile(struct file *file, loff_t *foffset)
+static int alignfile(struct coredump_params *cprm)
 {
 	static const char buf[4] = { 0, };
-	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
-	return 1;
+	return dump_emit(cprm, buf, roundup(cprm->written, 4) - cprm->written);
 }
 
-static int writenote(struct memelfnote *men, struct file *file,
-			loff_t *foffset)
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
 {
 	struct elf_note en;
 	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
-	DUMP_WRITE(&en, sizeof(en), foffset);
-	DUMP_WRITE(men->name, en.n_namesz, foffset);
-	if (!alignfile(file, foffset))
-		return 0;
-	DUMP_WRITE(men->data, men->datasz, foffset);
-	if (!alignfile(file, foffset))
-		return 0;
-
-	return 1;
+	return dump_emit(cprm, &en, sizeof(en)) &&
+		dump_emit(cprm, men->name, en.n_namesz) && alignfile(cprm) &&
+		dump_emit(cprm, men->data, men->datasz) && alignfile(cprm);
 }
-#undef DUMP_WRITE
 
 static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
 {
@@ -1500,66 +1488,40 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
 /*
  * dump the segments for an MMU process
  */
-#ifdef CONFIG_MMU
-static int elf_fdpic_dump_segments(struct file *file, size_t *size,
-			   unsigned long *limit, unsigned long mm_flags)
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
 {
 	struct vm_area_struct *vma;
-	int err = 0;
 
 	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		unsigned long addr;
 
-		if (!maydump(vma, mm_flags))
+		if (!maydump(vma, cprm->mm_flags))
 			continue;
 
+#ifdef CONFIG_MMU
 		for (addr = vma->vm_start; addr < vma->vm_end;
 							addr += PAGE_SIZE) {
+			bool res;
 			struct page *page = get_dump_page(addr);
 			if (page) {
 				void *kaddr = kmap(page);
-				*size += PAGE_SIZE;
-				if (*size > *limit)
-					err = -EFBIG;
-				else if (!dump_write(file, kaddr, PAGE_SIZE))
-					err = -EIO;
+				res = dump_emit(cprm, kaddr, PAGE_SIZE);
 				kunmap(page);
 				page_cache_release(page);
-			} else if (!dump_seek(file, PAGE_SIZE))
-				err = -EFBIG;
-			if (err)
-				goto out;
+			} else {
+				res = dump_seek(file, PAGE_SIZE);
+			}
+			if (!res)
+				return false;
 		}
-	}
-out:
-	return err;
-}
-#endif
-
-/*
- * dump the segments for a NOMMU process
- */
-#ifndef CONFIG_MMU
-static int elf_fdpic_dump_segments(struct file *file, size_t *size,
-			   unsigned long *limit, unsigned long mm_flags)
-{
-	struct vm_area_struct *vma;
-
-	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
-		if (!maydump(vma, mm_flags))
-			continue;
-
-		if ((*size += PAGE_SIZE) > *limit)
-			return -EFBIG;
-
-		if (!dump_write(file, (void *) vma->vm_start,
+#else
+		if (!dump_emit(cprm, (void *) vma->vm_start,
 				vma->vm_end - vma->vm_start))
-			return -EIO;
+			return false;
+#endif
 	}
-
-	return 0;
+	return true;
 }
-#endif
 
 static size_t elf_core_vma_data_size(unsigned long mm_flags)
 {
@@ -1755,13 +1717,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 
 	offset = dataoff;
 
-	size += sizeof(*elf);
-	if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+	if (!dump_emit(cprm, elf, sizeof(*elf)))
 		goto end_coredump;
 
-	size += sizeof(*phdr4note);
-	if (size > cprm->limit
-	    || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+	if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
 		goto end_coredump;
 
 	/* write program headers for segments dump */
@@ -1785,20 +1744,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
-		size += sizeof(phdr);
-		if (size > cprm->limit
-		    || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
 			goto end_coredump;
 	}
 
-	cprm->written = size;
 	if (!elf_core_write_extra_phdrs(cprm, offset))
 		goto end_coredump;
 
 	size = cprm->written;
+	cprm->written = foffset;
  	/* write out the notes section */
 	for (i = 0; i < numnote; i++)
-		if (!writenote(notes + i, cprm->file, &foffset))
+		if (!writenote(notes + i, cprm))
 			goto end_coredump;
 
 	/* write out the thread status notes section */
@@ -1807,27 +1764,22 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 				list_entry(t, struct elf_thread_status, list);
 
 		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], cprm->file, &foffset))
+			if (!writenote(&tmp->notes[i], cprm))
 				goto end_coredump;
 	}
 
-	if (!dump_seek(cprm->file, dataoff - foffset))
+	if (!dump_seek(cprm->file, dataoff - cprm->written))
 		goto end_coredump;
 
-	if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
-				    cprm->mm_flags) < 0)
+	cprm->written = size;
+	if (!elf_fdpic_dump_segments(cprm))
 		goto end_coredump;
 
-	cprm->written = size;
 	if (!elf_core_write_extra_data(cprm))
 		goto end_coredump;
-	size = cprm->written;
 
 	if (e_phnum == PN_XNUM) {
-		size += sizeof(*shdr4extnum);
-		if (size > cprm->limit
-		    || !dump_write(cprm->file, shdr4extnum,
-				   sizeof(*shdr4extnum)))
+		if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
 			goto end_coredump;
 	}
 
-- 
cgit v1.3


From cdc3d5627d5f7c4e6b6372b9fb39cba0fe6a9b2a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Oct 2013 22:24:29 -0400
Subject: switch elf_coredump_extra_notes_write() to dump_emit()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/include/asm/spu.h               |  3 +-
 arch/powerpc/platforms/cell/spu_syscalls.c   |  5 ++--
 arch/powerpc/platforms/cell/spufs/coredump.c | 44 +++++++++++-----------------
 arch/powerpc/platforms/cell/spufs/spufs.h    |  3 +-
 fs/binfmt_elf.c                              |  7 ++---
 include/linux/elf.h                          |  6 ++--
 6 files changed, 30 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/arch/powerpc/include/asm/spu.h b/arch/powerpc/include/asm/spu.h
index 93f280e23279..37b7ca39ec9f 100644
--- a/arch/powerpc/include/asm/spu.h
+++ b/arch/powerpc/include/asm/spu.h
@@ -235,6 +235,7 @@ extern long spu_sys_callback(struct spu_syscall_block *s);
 
 /* syscalls implemented in spufs */
 struct file;
+struct coredump_params;
 struct spufs_calls {
 	long (*create_thread)(const char __user *name,
 					unsigned int flags, umode_t mode,
@@ -242,7 +243,7 @@ struct spufs_calls {
 	long (*spu_run)(struct file *filp, __u32 __user *unpc,
 						__u32 __user *ustatus);
 	int (*coredump_extra_notes_size)(void);
-	int (*coredump_extra_notes_write)(struct file *file, loff_t *foffset);
+	int (*coredump_extra_notes_write)(struct coredump_params *cprm);
 	void (*notify_spus_active)(void);
 	struct module *owner;
 };
diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index db4e638cf408..3844f1397fc3 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/rcupdate.h>
+#include <linux/binfmts.h>
 
 #include <asm/spu.h>
 
@@ -126,7 +127,7 @@ int elf_coredump_extra_notes_size(void)
 	return ret;
 }
 
-int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset)
+int elf_coredump_extra_notes_write(struct coredump_params *cprm)
 {
 	struct spufs_calls *calls;
 	int ret;
@@ -135,7 +136,7 @@ int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset)
 	if (!calls)
 		return 0;
 
-	ret = calls->coredump_extra_notes_write(file, foffset);
+	ret = calls->coredump_extra_notes_write(cprm);
 
 	spufs_calls_put(calls);
 
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index c9500ea7be2f..5d9b0a288f36 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -27,6 +27,8 @@
 #include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/syscalls.h>
+#include <linux/coredump.h>
+#include <linux/binfmts.h>
 
 #include <asm/uaccess.h>
 
@@ -52,35 +54,24 @@ static ssize_t do_coredump_read(int num, struct spu_context *ctx, void *buffer,
  * These are the only things you should do on a core-file: use only these
  * functions to write out all the necessary info.
  */
-static int spufs_dump_write(struct file *file, const void *addr, int nr, loff_t *foffset)
+static int spufs_dump_write(struct coredump_params *cprm, const void *addr, int nr)
 {
-	unsigned long limit = rlimit(RLIMIT_CORE);
-	ssize_t written;
-
-	if (*foffset + nr > limit)
+	if (!dump_emit(cprm, addr, nr))
 		return -EIO;
-
-	written = file->f_op->write(file, addr, nr, &file->f_pos);
-	*foffset += written;
-
-	if (written != nr)
-		return -EIO;
-
 	return 0;
 }
 
-static int spufs_dump_align(struct file *file, char *buf, loff_t new_off,
-			    loff_t *foffset)
+static int spufs_dump_align(struct coredump_params *cprm, char *buf, loff_t new_off)
 {
 	int rc, size;
 
-	size = min((loff_t)PAGE_SIZE, new_off - *foffset);
+	size = min((loff_t)PAGE_SIZE, new_off - cprm->written);
 	memset(buf, 0, size);
 
 	rc = 0;
-	while (rc == 0 && new_off > *foffset) {
-		size = min((loff_t)PAGE_SIZE, new_off - *foffset);
-		rc = spufs_dump_write(file, buf, size, foffset);
+	while (rc == 0 && new_off > cprm->written) {
+		size = min((loff_t)PAGE_SIZE, new_off - cprm->written);
+		rc = spufs_dump_write(cprm, buf, size);
 	}
 
 	return rc;
@@ -165,7 +156,7 @@ int spufs_coredump_extra_notes_size(void)
 }
 
 static int spufs_arch_write_note(struct spu_context *ctx, int i,
-				  struct file *file, int dfd, loff_t *foffset)
+				  struct coredump_params *cprm, int dfd)
 {
 	loff_t pos = 0;
 	int sz, rc, nread, total = 0;
@@ -186,22 +177,22 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i,
 	en.n_descsz = sz;
 	en.n_type = NT_SPU;
 
-	rc = spufs_dump_write(file, &en, sizeof(en), foffset);
+	rc = spufs_dump_write(cprm, &en, sizeof(en));
 	if (rc)
 		goto out;
 
-	rc = spufs_dump_write(file, fullname, en.n_namesz, foffset);
+	rc = spufs_dump_write(cprm, fullname, en.n_namesz);
 	if (rc)
 		goto out;
 
-	rc = spufs_dump_align(file, buf, roundup(*foffset, 4), foffset);
+	rc = spufs_dump_align(cprm, buf, roundup(cprm->written, 4));
 	if (rc)
 		goto out;
 
 	do {
 		nread = do_coredump_read(i, ctx, buf, bufsz, &pos);
 		if (nread > 0) {
-			rc = spufs_dump_write(file, buf, nread, foffset);
+			rc = spufs_dump_write(cprm, buf, nread);
 			if (rc)
 				goto out;
 			total += nread;
@@ -213,15 +204,14 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i,
 		goto out;
 	}
 
-	rc = spufs_dump_align(file, buf, roundup(*foffset - total + sz, 4),
-			      foffset);
+	rc = spufs_dump_align(cprm, buf, roundup(cprm->written - total + sz, 4));
 
 out:
 	free_page((unsigned long)buf);
 	return rc;
 }
 
-int spufs_coredump_extra_notes_write(struct file *file, loff_t *foffset)
+int spufs_coredump_extra_notes_write(struct coredump_params *cprm)
 {
 	struct spu_context *ctx;
 	int fd, j, rc;
@@ -233,7 +223,7 @@ int spufs_coredump_extra_notes_write(struct file *file, loff_t *foffset)
 			return rc;
 
 		for (j = 0; spufs_coredump_read[j].name != NULL; j++) {
-			rc = spufs_arch_write_note(ctx, j, file, fd, foffset);
+			rc = spufs_arch_write_note(ctx, j, cprm, fd);
 			if (rc) {
 				spu_release_saved(ctx);
 				return rc;
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 67852ade4c01..0ba3c9598358 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -247,12 +247,13 @@ extern const struct spufs_tree_descr spufs_dir_debug_contents[];
 
 /* system call implementation */
 extern struct spufs_calls spufs_calls;
+struct coredump_params;
 long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status);
 long spufs_create(struct path *nd, struct dentry *dentry, unsigned int flags,
 			umode_t mode, struct file *filp);
 /* ELF coredump callbacks for writing SPU ELF notes */
 extern int spufs_coredump_extra_notes_size(void);
-extern int spufs_coredump_extra_notes_write(struct file *file, loff_t *foffset);
+extern int spufs_coredump_extra_notes_write(struct coredump_params *cprm);
 
 extern const struct file_operations spufs_context_fops;
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ddab40d3d828..3bf75d767a4a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2037,7 +2037,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 	size_t size = 0;
 	struct vm_area_struct *vma, *gate_vma;
 	struct elfhdr *elf = NULL;
-	loff_t offset = 0, dataoff, foffset;
+	loff_t offset = 0, dataoff;
 	struct elf_note_info info = { };
 	struct elf_phdr *phdr4note = NULL;
 	struct elf_shdr *shdr4extnum = NULL;
@@ -2160,12 +2160,11 @@ static int elf_core_dump(struct coredump_params *cprm)
 	if (!write_note_info(&info, cprm))
 		goto end_coredump;
 
-	foffset = cprm->written;
-	if (elf_coredump_extra_notes_write(cprm->file, &foffset))
+	if (elf_coredump_extra_notes_write(cprm))
 		goto end_coredump;
 
 	/* Align to page */
-	if (!dump_seek(cprm->file, dataoff - foffset))
+	if (!dump_seek(cprm->file, dataoff - cprm->written))
 		goto end_coredump;
 
 	cprm->written = size;
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 40a3c0e01b2b..67a5fa7830c4 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -39,13 +39,13 @@ extern Elf64_Dyn _DYNAMIC [];
 
 /* Optional callbacks to write extra ELF notes. */
 struct file;
+struct coredump_params;
 
 #ifndef ARCH_HAVE_EXTRA_ELF_NOTES
 static inline int elf_coredump_extra_notes_size(void) { return 0; }
-static inline int elf_coredump_extra_notes_write(struct file *file,
-			loff_t *foffset) { return 0; }
+static inline int elf_coredump_extra_notes_write(struct coredump_params *cprm) { return 0; }
 #else
 extern int elf_coredump_extra_notes_size(void);
-extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset);
+extern int elf_coredump_extra_notes_write(struct coredump_params *cprm);
 #endif
 #endif /* _LINUX_ELF_H */
-- 
cgit v1.3


From 43a5d548eb5949580c1d96b9f3fa9f4c0bee0b98 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Oct 2013 07:22:01 -0400
Subject: aout: switch to dump_emit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/ia32/ia32_aout.c | 20 ++++++++------------
 fs/binfmt_aout.c          |  7 +++----
 2 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 80361c00fb64..9e26e9e85576 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -133,14 +133,6 @@ static void set_brk(unsigned long start, unsigned long end)
 
 #include <linux/coredump.h>
 
-#define DUMP_WRITE(addr, nr)			     \
-	if (!dump_write(cprm->file, (void *)(addr), (nr))) \
-		goto end_coredump;
-
-#define DUMP_SEEK(offset)		\
-	if (!dump_seek(cprm->file, offset))	\
-		goto end_coredump;
-
 #define START_DATA(u)	(u.u_tsize << PAGE_SHIFT)
 #define START_STACK(u)	(u.start_stack)
 
@@ -192,22 +184,26 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 	set_fs(KERNEL_DS);
 	/* struct user */
-	DUMP_WRITE(&dump, sizeof(dump));
+	if (!dump_emit(cprm, &dump, sizeof(dump)))
+		goto end_coredump;
 	/* Now dump all of the user data.  Include malloced stuff as well */
-	DUMP_SEEK(PAGE_SIZE - sizeof(dump));
+	if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+		goto end_coredump;
 	/* now we start writing out the user space info */
 	set_fs(USER_DS);
 	/* Dump the data area */
 	if (dump.u_dsize != 0) {
 		dump_start = START_DATA(dump);
 		dump_size = dump.u_dsize << PAGE_SHIFT;
-		DUMP_WRITE(dump_start, dump_size);
+		if (!dump_emit(cprm, (void *)dump_start, dump_size))
+			goto end_coredump;
 	}
 	/* Now prepare to dump the stack area */
 	if (dump.u_ssize != 0) {
 		dump_start = START_STACK(dump);
 		dump_size = dump.u_ssize << PAGE_SHIFT;
-		DUMP_WRITE(dump_start, dump_size);
+		if (!dump_emit(cprm, (void *)dump_start, dump_size))
+			goto end_coredump;
 	}
 end_coredump:
 	set_fs(fs);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index faaa819f70fa..a4f847f77234 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -45,7 +45,6 @@ static int load_aout_library(struct file*);
  */
 static int aout_core_dump(struct coredump_params *cprm)
 {
-	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
 	void __user *dump_start;
@@ -85,7 +84,7 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 	set_fs(KERNEL_DS);
 /* struct user */
-	if (!dump_write(file, &dump, sizeof(dump)))
+	if (!dump_emit(cprm, &dump, sizeof(dump)))
 		goto end_coredump;
 /* Now dump all of the user data.  Include malloced stuff as well */
 	if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
@@ -96,14 +95,14 @@ static int aout_core_dump(struct coredump_params *cprm)
 	if (dump.u_dsize != 0) {
 		dump_start = START_DATA(dump);
 		dump_size = dump.u_dsize << PAGE_SHIFT;
-		if (!dump_write(file, dump_start, dump_size))
+		if (!dump_emit(cprm, dump_start, dump_size))
 			goto end_coredump;
 	}
 /* Now prepare to dump the stack area */
 	if (dump.u_ssize != 0) {
 		dump_start = START_STACK(dump);
 		dump_size = dump.u_ssize << PAGE_SHIFT;
-		if (!dump_write(file, dump_start, dump_size))
+		if (!dump_emit(cprm, dump_start, dump_size))
 			goto end_coredump;
 	}
 end_coredump:
-- 
cgit v1.3


From 1ad67015e619ba4e0b15ed0482d464292fedf263 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Oct 2013 07:23:45 -0400
Subject: binfmt_elf: count notes towards coredump limit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3bf75d767a4a..4f7dda9d86b5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2034,7 +2034,6 @@ static int elf_core_dump(struct coredump_params *cprm)
 	int has_dumped = 0;
 	mm_segment_t fs;
 	int segs;
-	size_t size = 0;
 	struct vm_area_struct *vma, *gate_vma;
 	struct elfhdr *elf = NULL;
 	loff_t offset = 0, dataoff;
@@ -2155,7 +2154,6 @@ static int elf_core_dump(struct coredump_params *cprm)
 	if (!elf_core_write_extra_phdrs(cprm, offset))
 		goto end_coredump;
 
-	size = cprm->written;
  	/* write out the notes section */
 	if (!write_note_info(&info, cprm))
 		goto end_coredump;
@@ -2167,7 +2165,6 @@ static int elf_core_dump(struct coredump_params *cprm)
 	if (!dump_seek(cprm->file, dataoff - cprm->written))
 		goto end_coredump;
 
-	cprm->written = size;
 	for (vma = first_vma(current, gate_vma); vma != NULL;
 			vma = next_vma(vma, gate_vma)) {
 		unsigned long addr;
-- 
cgit v1.3


From 2507a4fbd48a96bc4236e584252635f8539079df Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Oct 2013 09:11:48 -0400
Subject: make dump_emit() use vfs_write() instead of banging at ->f_op->write
 directly

... and deal with short writes properly - the output might be to pipe, after
all; as it is, e.g. no-MMU case of elf_fdpic coredump can write a whole lot
more than a page worth of data at one call.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coredump.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index 319f973bab72..2472ed9e682c 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -696,13 +696,20 @@ EXPORT_SYMBOL(dump_write);
 int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 {
 	struct file *file = cprm->file;
-	if (dump_interrupted() || !access_ok(VERIFY_READ, addr, nr))
-		return 0;
+	loff_t pos = file->f_pos;
+	ssize_t n;
 	if (cprm->written + nr > cprm->limit)
 		return 0;
-	if (file->f_op->write(file, addr, nr, &file->f_pos) != nr)
-		return 0;
-	cprm->written += nr;
+	while (nr) {
+		if (dump_interrupted())
+			return 0;
+		n = vfs_write(file, addr, nr, &pos);
+		if (n <= 0)
+			return 0;
+		file->f_pos = pos;
+		cprm->written += n;
+		nr -= n;
+	}
 	return 1;
 }
 EXPORT_SYMBOL(dump_emit);
-- 
cgit v1.3


From 9b56d54380adb5fef71f687109bbd6f8413d694f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Oct 2013 09:26:08 -0400
Subject: dump_skip(): dump_seek() replacement taking coredump_params

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/ia32/ia32_aout.c |  2 +-
 fs/binfmt_aout.c          |  2 +-
 fs/binfmt_elf.c           |  4 ++--
 fs/binfmt_elf_fdpic.c     | 11 +++--------
 fs/coredump.c             | 43 ++++++++++++++-----------------------------
 include/linux/coredump.h  |  3 +--
 6 files changed, 22 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 9e26e9e85576..d21ff89207cd 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -187,7 +187,7 @@ static int aout_core_dump(struct coredump_params *cprm)
 	if (!dump_emit(cprm, &dump, sizeof(dump)))
 		goto end_coredump;
 	/* Now dump all of the user data.  Include malloced stuff as well */
-	if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+	if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
 		goto end_coredump;
 	/* now we start writing out the user space info */
 	set_fs(USER_DS);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index a4f847f77234..ca0ba15a7306 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -87,7 +87,7 @@ static int aout_core_dump(struct coredump_params *cprm)
 	if (!dump_emit(cprm, &dump, sizeof(dump)))
 		goto end_coredump;
 /* Now dump all of the user data.  Include malloced stuff as well */
-	if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+	if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
 		goto end_coredump;
 /* now we start writing out the user space info */
 	set_fs(USER_DS);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4f7dda9d86b5..c56ae3264a65 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2162,7 +2162,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		goto end_coredump;
 
 	/* Align to page */
-	if (!dump_seek(cprm->file, dataoff - cprm->written))
+	if (!dump_skip(cprm, dataoff - cprm->written))
 		goto end_coredump;
 
 	for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -2183,7 +2183,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 				kunmap(page);
 				page_cache_release(page);
 			} else
-				stop = !dump_seek(cprm->file, PAGE_SIZE);
+				stop = !dump_skip(cprm, PAGE_SIZE);
 			if (stop)
 				goto end_coredump;
 		}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 70e299917898..a69fc4ae1c85 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1509,7 +1509,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
 				kunmap(page);
 				page_cache_release(page);
 			} else {
-				res = dump_seek(file, PAGE_SIZE);
+				res = dump_skip(cprm, PAGE_SIZE);
 			}
 			if (!res)
 				return false;
@@ -1547,11 +1547,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	int has_dumped = 0;
 	mm_segment_t fs;
 	int segs;
-	size_t size = 0;
 	int i;
 	struct vm_area_struct *vma;
 	struct elfhdr *elf = NULL;
-	loff_t offset = 0, dataoff, foffset;
+	loff_t offset = 0, dataoff;
 	int numnote;
 	struct memelfnote *notes = NULL;
 	struct elf_prstatus *prstatus = NULL;	/* NT_PRSTATUS */
@@ -1682,7 +1681,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 
 	offset += sizeof(*elf);				/* Elf header */
 	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
-	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
@@ -1751,8 +1749,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	if (!elf_core_write_extra_phdrs(cprm, offset))
 		goto end_coredump;
 
-	size = cprm->written;
-	cprm->written = foffset;
  	/* write out the notes section */
 	for (i = 0; i < numnote; i++)
 		if (!writenote(notes + i, cprm))
@@ -1768,10 +1764,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 				goto end_coredump;
 	}
 
-	if (!dump_seek(cprm->file, dataoff - cprm->written))
+	if (!dump_skip(cprm, dataoff - cprm->written))
 		goto end_coredump;
 
-	cprm->written = size;
 	if (!elf_fdpic_dump_segments(cprm))
 		goto end_coredump;
 
diff --git a/fs/coredump.c b/fs/coredump.c
index 2472ed9e682c..18baf2c009d4 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -685,14 +685,6 @@ fail:
  * do on a core-file: use only these functions to write out all the
  * necessary info.
  */
-int dump_write(struct file *file, const void *addr, int nr)
-{
-	return !dump_interrupted() &&
-		access_ok(VERIFY_READ, addr, nr) &&
-		file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-EXPORT_SYMBOL(dump_write);
-
 int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 {
 	struct file *file = cprm->file;
@@ -714,32 +706,25 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 }
 EXPORT_SYMBOL(dump_emit);
 
-int dump_seek(struct file *file, loff_t off)
+int dump_skip(struct coredump_params *cprm, size_t nr)
 {
-	int ret = 1;
-
+	static char zeroes[PAGE_SIZE];
+	struct file *file = cprm->file;
 	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+		if (cprm->written + nr > cprm->limit)
+			return 0;
 		if (dump_interrupted() ||
-		    file->f_op->llseek(file, off, SEEK_CUR) < 0)
+		    file->f_op->llseek(file, nr, SEEK_CUR) < 0)
 			return 0;
+		cprm->written += nr;
+		return 1;
 	} else {
-		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-
-		if (!buf)
-			return 0;
-		while (off > 0) {
-			unsigned long n = off;
-
-			if (n > PAGE_SIZE)
-				n = PAGE_SIZE;
-			if (!dump_write(file, buf, n)) {
-				ret = 0;
-				break;
-			}
-			off -= n;
+		while (nr > PAGE_SIZE) {
+			if (!dump_emit(cprm, zeroes, PAGE_SIZE))
+				return 0;
+			nr -= PAGE_SIZE;
 		}
-		free_page((unsigned long)buf);
+		return dump_emit(cprm, zeroes, nr);
 	}
-	return ret;
 }
-EXPORT_SYMBOL(dump_seek);
+EXPORT_SYMBOL(dump_skip);
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 2959376a9ad5..07a0af93f230 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -11,8 +11,7 @@
  * functions to write out all the necessary info.
  */
 struct coredump_params;
-extern int dump_write(struct file *file, const void *addr, int nr);
-extern int dump_seek(struct file *file, loff_t off);
+extern int dump_skip(struct coredump_params *cprm, size_t nr);
 extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 #ifdef CONFIG_COREDUMP
 extern void do_coredump(siginfo_t *siginfo);
-- 
cgit v1.3


From 22a8cb8248ba5d340307ba72432253b1dbdb5cf7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Oct 2013 11:05:01 -0400
Subject: new helper: dump_align()

dump_skip to given alignment...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/coredump.c |  2 +-
 fs/binfmt_elf.c                              | 10 ++--------
 fs/binfmt_elf_fdpic.c                        | 10 ++--------
 fs/coredump.c                                |  9 +++++++++
 include/linux/coredump.h                     |  1 +
 5 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 1aaaa586fa50..be6212ddbf06 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -156,7 +156,7 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i,
 	if (!dump_emit(cprm, fullname, en.n_namesz))
 		goto Eio;
 
-	if (!dump_skip(cprm, roundup(cprm->written, 4) - cprm->written))
+	if (!dump_align(cprm, 4))
 		goto Eio;
 
 	do {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c56ae3264a65..864154972670 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1225,12 +1225,6 @@ static int notesize(struct memelfnote *en)
 	return sz;
 }
 
-static int alignfile(struct coredump_params *cprm)
-{
-	static const char buf[4] = { 0, };
-	return dump_emit(cprm, buf, roundup(cprm->written, 4) - cprm->written);
-}
-
 static int writenote(struct memelfnote *men, struct coredump_params *cprm)
 {
 	struct elf_note en;
@@ -1239,8 +1233,8 @@ static int writenote(struct memelfnote *men, struct coredump_params *cprm)
 	en.n_type = men->type;
 
 	return dump_emit(cprm, &en, sizeof(en)) &&
-	    dump_emit(cprm, men->name, en.n_namesz) && alignfile(cprm) &&
-	    dump_emit(cprm, men->data, men->datasz) && alignfile(cprm);
+	    dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+	    dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
 }
 
 static void fill_elf_header(struct elfhdr *elf, int segs,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a69fc4ae1c85..645f6e56f378 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1267,12 +1267,6 @@ static int notesize(struct memelfnote *en)
 
 /* #define DEBUG */
 
-static int alignfile(struct coredump_params *cprm)
-{
-	static const char buf[4] = { 0, };
-	return dump_emit(cprm, buf, roundup(cprm->written, 4) - cprm->written);
-}
-
 static int writenote(struct memelfnote *men, struct coredump_params *cprm)
 {
 	struct elf_note en;
@@ -1281,8 +1275,8 @@ static int writenote(struct memelfnote *men, struct coredump_params *cprm)
 	en.n_type = men->type;
 
 	return dump_emit(cprm, &en, sizeof(en)) &&
-		dump_emit(cprm, men->name, en.n_namesz) && alignfile(cprm) &&
-		dump_emit(cprm, men->data, men->datasz) && alignfile(cprm);
+		dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+		dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
 }
 
 static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
diff --git a/fs/coredump.c b/fs/coredump.c
index 18baf2c009d4..dc1f937413d5 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -728,3 +728,12 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
 	}
 }
 EXPORT_SYMBOL(dump_skip);
+
+int dump_align(struct coredump_params *cprm, int align)
+{
+	unsigned mod = cprm->written & (align - 1);
+	if (align & (align - 1))
+		return -EINVAL;
+	return mod ? dump_skip(cprm, align - mod) : 0;
+}
+EXPORT_SYMBOL(dump_align);
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 07a0af93f230..d8eb880be82a 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -13,6 +13,7 @@
 struct coredump_params;
 extern int dump_skip(struct coredump_params *cprm, size_t nr);
 extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
+extern int dump_align(struct coredump_params *cprm, int align);
 #ifdef CONFIG_COREDUMP
 extern void do_coredump(siginfo_t *siginfo);
 #else
-- 
cgit v1.3


From 6987843ff7e836ea65b554905aec34d2fad05c94 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Oct 2013 22:35:11 -0400
Subject: take anon inode allocation to libfs.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/anon_inodes.c   | 50 ++------------------------------------------------
 fs/libfs.c         | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  1 +
 3 files changed, 46 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 85c961849953..22f9698a1214 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,7 +24,6 @@
 
 static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
-static const struct file_operations anon_inode_fops;
 
 /*
  * anon_inodefs_dname() is called from d_path().
@@ -39,51 +38,6 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
 	.d_dname	= anon_inodefs_dname,
 };
 
-/*
- * nop .set_page_dirty method so that people can use .page_mkwrite on
- * anon inodes.
- */
-static int anon_set_page_dirty(struct page *page)
-{
-	return 0;
-};
-
-static const struct address_space_operations anon_aops = {
-	.set_page_dirty = anon_set_page_dirty,
-};
-
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
-static struct inode *anon_inode_mkinode(struct super_block *s)
-{
-	struct inode *inode = new_inode_pseudo(s);
-
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	inode->i_ino = get_next_ino();
-	inode->i_fop = &anon_inode_fops;
-
-	inode->i_mapping->a_ops = &anon_aops;
-
-	/*
-	 * Mark the inode dirty from the very beginning,
-	 * that way it will never be moved to the dirty
-	 * list because mark_inode_dirty() will think
-	 * that it already _is_ on the dirty list.
-	 */
-	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IRUSR | S_IWUSR;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
-	inode->i_flags |= S_PRIVATE;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	return inode;
-}
-
 static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
@@ -92,7 +46,7 @@ static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
 			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
 	if (!IS_ERR(root)) {
 		struct super_block *s = root->d_sb;
-		anon_inode_inode = anon_inode_mkinode(s);
+		anon_inode_inode = alloc_anon_inode(s);
 		if (IS_ERR(anon_inode_inode)) {
 			dput(root);
 			deactivate_locked_super(s);
@@ -134,7 +88,7 @@ struct file *anon_inode_getfile_private(const char *name,
 	if (fops->owner && !try_module_get(fops->owner))
 		return ERR_PTR(-ENOENT);
 
-	inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb);
+	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
 	if (IS_ERR(inode)) {
 		file = ERR_PTR(-ENOMEM);
 		goto err_module;
diff --git a/fs/libfs.c b/fs/libfs.c
index 8c5018493154..5de06947ba5e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1002,3 +1002,46 @@ void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
 		kfree(s);
 }
 EXPORT_SYMBOL(kfree_put_link);
+
+/*
+ * nop .set_page_dirty method so that people can use .page_mkwrite on
+ * anon inodes.
+ */
+static int anon_set_page_dirty(struct page *page)
+{
+	return 0;
+};
+
+/*
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so we need
+ * only allocate one of them.
+ */
+struct inode *alloc_anon_inode(struct super_block *s)
+{
+	static const struct address_space_operations anon_aops = {
+		.set_page_dirty = anon_set_page_dirty,
+	};
+	struct inode *inode = new_inode_pseudo(s);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_ino = get_next_ino();
+	inode->i_mapping->a_ops = &anon_aops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because mark_inode_dirty() will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+EXPORT_SYMBOL(alloc_anon_inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2b0f4e974480..e190326ac212 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2562,6 +2562,7 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping,
 extern int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
+extern struct inode *alloc_anon_inode(struct super_block *);
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
-- 
cgit v1.3


From 71ad7490c1f32bd7829df76360f9fa17829868f3 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Tue, 17 Sep 2013 10:18:25 -0400
Subject: rework aio migrate pages to use aio fs

Don't abuse anon_inodes.c to host private files needed by aio;
we can bloody well declare a mini-fs of our own instead of
patching up what anon_inodes can create for us.

Tested-by: Benjamin LaHaise <bcrl@kvack.org>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 067e3d340c35..1f602d9be4c5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,10 +36,10 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
-#include <linux/anon_inodes.h>
 #include <linux/migrate.h>
 #include <linux/ramfs.h>
 #include <linux/percpu-refcount.h>
+#include <linux/mount.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -152,12 +152,67 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
+static struct vfsmount *aio_mnt;
+
+static const struct file_operations aio_ring_fops;
+static const struct address_space_operations aio_ctx_aops;
+
+static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
+{
+	struct qstr this = QSTR_INIT("[aio]", 5);
+	struct file *file;
+	struct path path;
+	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_mapping->a_ops = &aio_ctx_aops;
+	inode->i_mapping->private_data = ctx;
+	inode->i_size = PAGE_SIZE * nr_pages;
+
+	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
+	if (!path.dentry) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	path.mnt = mntget(aio_mnt);
+
+	d_instantiate(path.dentry, inode);
+	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
+	if (IS_ERR(file)) {
+		path_put(&path);
+		return file;
+	}
+
+	file->f_flags = O_RDWR;
+	file->private_data = ctx;
+	return file;
+}
+
+static struct dentry *aio_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
+{
+	static const struct dentry_operations ops = {
+		.d_dname	= simple_dname,
+	};
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+}
+
 /* aio_setup
  *	Creates the slab caches used by the aio routines, panic on
  *	failure as this is done early during the boot sequence.
  */
 static int __init aio_setup(void)
 {
+	static struct file_system_type aio_fs = {
+		.name		= "aio",
+		.mount		= aio_mount,
+		.kill_sb	= kill_anon_super,
+	};
+	aio_mnt = kern_mount(&aio_fs);
+	if (IS_ERR(aio_mnt))
+		panic("Failed to create aio fs mount.");
+
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
@@ -283,16 +338,12 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
 		return -EAGAIN;
 	}
 
-	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
-	file->f_inode->i_mapping->private_data = ctx;
-	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
-
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(file->f_inode->i_mapping,
-- 
cgit v1.3


From 078d8e624c1837aa8ad65e58054a4a40d7ac46d2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Oct 2013 10:26:28 -0400
Subject: ... and kill anon_inode_getfile_private()

it's a seriously misguided API, now fortunately without users.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/anon_inodes.c            | 66 ---------------------------------------------
 include/linux/anon_inodes.h |  3 ---
 2 files changed, 69 deletions(-)

(limited to 'fs')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 22f9698a1214..24084732b1d0 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -62,72 +62,6 @@ static struct file_system_type anon_inode_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-/**
- * anon_inode_getfile_private - creates a new file instance by hooking it up to an
- *                      anonymous inode, and a dentry that describe the "class"
- *                      of the file
- *
- * @name:    [in]    name of the "class" of the new file
- * @fops:    [in]    file operations for the new file
- * @priv:    [in]    private data for the new file (will be file's private_data)
- * @flags:   [in]    flags
- *
- *
- * Similar to anon_inode_getfile, but each file holds a single inode.
- *
- */
-struct file *anon_inode_getfile_private(const char *name,
-					const struct file_operations *fops,
-					void *priv, int flags)
-{
-	struct qstr this;
-	struct path path;
-	struct file *file;
-	struct inode *inode;
-
-	if (fops->owner && !try_module_get(fops->owner))
-		return ERR_PTR(-ENOENT);
-
-	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
-	if (IS_ERR(inode)) {
-		file = ERR_PTR(-ENOMEM);
-		goto err_module;
-	}
-
-	/*
-	 * Link the inode to a directory entry by creating a unique name
-	 * using the inode sequence number.
-	 */
-	file = ERR_PTR(-ENOMEM);
-	this.name = name;
-	this.len = strlen(name);
-	this.hash = 0;
-	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
-	if (!path.dentry)
-		goto err_module;
-
-	path.mnt = mntget(anon_inode_mnt);
-
-	d_instantiate(path.dentry, inode);
-
-	file = alloc_file(&path, OPEN_FMODE(flags), fops);
-	if (IS_ERR(file))
-		goto err_dput;
-
-	file->f_mapping = inode->i_mapping;
-	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
-	file->private_data = priv;
-
-	return file;
-
-err_dput:
-	path_put(&path);
-err_module:
-	module_put(fops->owner);
-	return file;
-}
-EXPORT_SYMBOL_GPL(anon_inode_getfile_private);
-
 /**
  * anon_inode_getfile - creates a new file instance by hooking it up to an
  *                      anonymous inode, and a dentry that describe the "class"
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index cf573c22b81e..8013a45242fe 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -13,9 +13,6 @@ struct file_operations;
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
-struct file *anon_inode_getfile_private(const char *name,
-				const struct file_operations *fops,
-				void *priv, int flags);
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		     void *priv, int flags);
 
-- 
cgit v1.3


From ce3959604878c1c693979ec552069dc8bdb5ccde Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Oct 2013 17:23:53 -0400
Subject: constify copy_siginfo_to_user{,32}()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/arm64/kernel/signal32.c     | 2 +-
 arch/ia64/kernel/signal.c        | 2 +-
 arch/mips/kernel/signal32.c      | 2 +-
 arch/parisc/kernel/signal32.c    | 2 +-
 arch/parisc/kernel/signal32.h    | 2 +-
 arch/powerpc/kernel/signal_32.c  | 2 +-
 arch/s390/kernel/compat_signal.c | 2 +-
 arch/sparc/kernel/signal32.c     | 2 +-
 arch/tile/kernel/compat_signal.c | 2 +-
 arch/x86/ia32/ia32_signal.c      | 2 +-
 fs/binfmt_elf.c                  | 2 +-
 include/asm-generic/siginfo.h    | 2 +-
 include/linux/compat.h           | 2 +-
 kernel/signal.c                  | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index e393174fe859..07da8c0fde2e 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -150,7 +150,7 @@ static inline int get_sigset_t(sigset_t *set,
 	return 0;
 }
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
 {
 	int err;
 
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 3637e03d2282..33cab9a8adff 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -105,7 +105,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
 }
 
 int
-copy_siginfo_to_user (siginfo_t __user *to, siginfo_t *from)
+copy_siginfo_to_user (siginfo_t __user *to, const siginfo_t *from)
 {
 	if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t)))
 		return -EFAULT;
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index 57de8b751627..1905a419aa46 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -314,7 +314,7 @@ SYSCALL_DEFINE3(32_sigaction, long, sig, const struct compat_sigaction __user *,
 	return ret;
 }
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
 {
 	int err;
 
diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c
index 6c6a271a6140..984abbee71ca 100644
--- a/arch/parisc/kernel/signal32.c
+++ b/arch/parisc/kernel/signal32.c
@@ -319,7 +319,7 @@ copy_siginfo_from_user32 (siginfo_t *to, compat_siginfo_t __user *from)
 }
 
 int
-copy_siginfo_to_user32 (compat_siginfo_t __user *to, siginfo_t *from)
+copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from)
 {
 	compat_uptr_t addr;
 	compat_int_t val;
diff --git a/arch/parisc/kernel/signal32.h b/arch/parisc/kernel/signal32.h
index 72ab41a51f32..af51d4ccee42 100644
--- a/arch/parisc/kernel/signal32.h
+++ b/arch/parisc/kernel/signal32.h
@@ -34,7 +34,7 @@ struct compat_ucontext {
 
 /* ELF32 signal handling */
 
-int copy_siginfo_to_user32 (compat_siginfo_t __user *to, siginfo_t *from);
+int copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from);
 int copy_siginfo_from_user32 (siginfo_t *to, compat_siginfo_t __user *from);
 
 /* In a deft move of uber-hackery, we decide to carry the top half of all
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index bebdf1a1a540..149cd583d061 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -891,7 +891,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 #endif
 
 #ifdef CONFIG_PPC64
-int copy_siginfo_to_user32(struct compat_siginfo __user *d, siginfo_t *s)
+int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s)
 {
 	int err;
 
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 1389b637dae5..44b25269a970 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -49,7 +49,7 @@ typedef struct
 	__u32 gprs_high[NUM_GPRS];
 } rt_sigframe32;
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
 {
 	int err;
 
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index b524f91dd0e5..ee789d2ef05d 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -68,7 +68,7 @@ struct rt_signal_frame32 {
 	/* __siginfo_rwin_t * */u32 rwin_save;
 } __attribute__((aligned(8)));
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
 {
 	int err;
 
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index 85e00b2f39bf..19c04b5ce408 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -49,7 +49,7 @@ struct compat_rt_sigframe {
 	struct compat_ucontext uc;
 };
 
-int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from)
+int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *from)
 {
 	int err;
 
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 665a730307f2..220675795e08 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,7 +34,7 @@
 #include <asm/sys_ia32.h>
 #include <asm/smap.h>
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
 {
 	int err = 0;
 	bool ia32 = test_thread_flag(TIF_IA32);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 864154972670..a6f8cb498b0e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1374,7 +1374,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
 }
 
 static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
-		siginfo_t *siginfo)
+		const siginfo_t *siginfo)
 {
 	mm_segment_t old_fs = get_fs();
 	set_fs(KERNEL_DS);
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index b685d3bd32e2..3d1a3af5cf59 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -32,6 +32,6 @@ static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
 
 #endif
 
-extern int copy_siginfo_to_user(struct siginfo __user *to, struct siginfo *from);
+extern int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from);
 
 #endif
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 345da00a86e0..78cdf51ff5ba 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -362,7 +362,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
 		       unsigned long bitmap_size);
 int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from);
-int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
+int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *from);
 int get_compat_sigevent(struct sigevent *event,
 		const struct compat_sigevent __user *u_event);
 long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
diff --git a/kernel/signal.c b/kernel/signal.c
index ded28b91fa53..940b30ee9a30 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
 
 #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
 
-int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
 {
 	int err;
 
-- 
cgit v1.3


From ec57941e031685de434916e5398d0ca1d44cd374 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Oct 2013 17:57:29 -0400
Subject: constify do_coredump() argument

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c          | 4 ++--
 fs/coredump.c            | 2 +-
 include/linux/binfmts.h  | 2 +-
 include/linux/coredump.h | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a6f8cb498b0e..1af412b96e06 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1581,7 +1581,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 
 static int fill_note_info(struct elfhdr *elf, int phdrs,
 			  struct elf_note_info *info,
-			  siginfo_t *siginfo, struct pt_regs *regs)
+			  const siginfo_t *siginfo, struct pt_regs *regs)
 {
 	struct task_struct *dump_task = current;
 	const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1830,7 +1830,7 @@ static int elf_note_info_init(struct elf_note_info *info)
 
 static int fill_note_info(struct elfhdr *elf, int phdrs,
 			  struct elf_note_info *info,
-			  siginfo_t *siginfo, struct pt_regs *regs)
+			  const siginfo_t *siginfo, struct pt_regs *regs)
 {
 	struct list_head *t;
 
diff --git a/fs/coredump.c b/fs/coredump.c
index dc1f937413d5..62406b6959b6 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -485,7 +485,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	return err;
 }
 
-void do_coredump(siginfo_t *siginfo)
+void do_coredump(const siginfo_t *siginfo)
 {
 	struct core_state core_state;
 	struct core_name cn;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8aa507e7a41a..790d3305a5a7 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -56,7 +56,7 @@ struct linux_binprm {
 
 /* Function parameter for binfmt->coredump */
 struct coredump_params {
-	siginfo_t *siginfo;
+	const siginfo_t *siginfo;
 	struct pt_regs *regs;
 	struct file *file;
 	unsigned long limit;
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index d8eb880be82a..d016a121a8c4 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -15,9 +15,9 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr);
 extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 extern int dump_align(struct coredump_params *cprm, int align);
 #ifdef CONFIG_COREDUMP
-extern void do_coredump(siginfo_t *siginfo);
+extern void do_coredump(const siginfo_t *siginfo);
 #else
-static inline void do_coredump(siginfo_t *siginfo) {}
+static inline void do_coredump(const siginfo_t *siginfo) {}
 #endif
 
 #endif /* _LINUX_COREDUMP_H */
-- 
cgit v1.3


From afabada957a6b28abfd37eb52efeefdfe6871c4b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Oct 2013 07:39:56 -0400
Subject: elf{,_fdpic} coredump: get rid of pointless if (siginfo->si_signo)

we can't get to do_coredump() if that condition isn't satisfied...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c       | 33 +++++++++++++++------------------
 fs/binfmt_elf_fdpic.c | 35 ++++++++++++++++-------------------
 2 files changed, 31 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1af412b96e06..571a42326908 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1833,31 +1833,28 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 			  const siginfo_t *siginfo, struct pt_regs *regs)
 {
 	struct list_head *t;
+	struct core_thread *ct;
+	struct elf_thread_status *ets;
 
 	if (!elf_note_info_init(info))
 		return 0;
 
-	if (siginfo->si_signo) {
-		struct core_thread *ct;
-		struct elf_thread_status *ets;
-
-		for (ct = current->mm->core_state->dumper.next;
-						ct; ct = ct->next) {
-			ets = kzalloc(sizeof(*ets), GFP_KERNEL);
-			if (!ets)
-				return 0;
+	for (ct = current->mm->core_state->dumper.next;
+					ct; ct = ct->next) {
+		ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+		if (!ets)
+			return 0;
 
-			ets->thread = ct->task;
-			list_add(&ets->list, &info->thread_list);
-		}
+		ets->thread = ct->task;
+		list_add(&ets->list, &info->thread_list);
+	}
 
-		list_for_each(t, &info->thread_list) {
-			int sz;
+	list_for_each(t, &info->thread_list) {
+		int sz;
 
-			ets = list_entry(t, struct elf_thread_status, list);
-			sz = elf_dump_thread_status(siginfo->si_signo, ets);
-			info->thread_status_size += sz;
-		}
+		ets = list_entry(t, struct elf_thread_status, list);
+		sz = elf_dump_thread_status(siginfo->si_signo, ets);
+		info->thread_status_size += sz;
 	}
 	/* now collect the dump for the current */
 	memset(info->prstatus, 0, sizeof(*info->prstatus));
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 645f6e56f378..fe2a643ee005 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1561,6 +1561,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	struct elf_shdr *shdr4extnum = NULL;
 	Elf_Half e_phnum;
 	elf_addr_t e_shoff;
+	struct core_thread *ct;
+	struct elf_thread_status *tmp;
 
 	/*
 	 * We no longer stop all VM operations.
@@ -1596,28 +1598,23 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 		goto cleanup;
 #endif
 
-	if (cprm->siginfo->si_signo) {
-		struct core_thread *ct;
-		struct elf_thread_status *tmp;
-
-		for (ct = current->mm->core_state->dumper.next;
-						ct; ct = ct->next) {
-			tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
-			if (!tmp)
-				goto cleanup;
+	for (ct = current->mm->core_state->dumper.next;
+					ct; ct = ct->next) {
+		tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+		if (!tmp)
+			goto cleanup;
 
-			tmp->thread = ct->task;
-			list_add(&tmp->list, &thread_list);
-		}
+		tmp->thread = ct->task;
+		list_add(&tmp->list, &thread_list);
+	}
 
-		list_for_each(t, &thread_list) {
-			struct elf_thread_status *tmp;
-			int sz;
+	list_for_each(t, &thread_list) {
+		struct elf_thread_status *tmp;
+		int sz;
 
-			tmp = list_entry(t, struct elf_thread_status, list);
-			sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
-			thread_status_size += sz;
-		}
+		tmp = list_entry(t, struct elf_thread_status, list);
+		sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
+		thread_status_size += sz;
 	}
 
 	/* now collect the dump for the current */
-- 
cgit v1.3


From b18825a7c8e37a7cf6abb97a12a6ad71af160de7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 12 Sep 2013 19:22:53 +0100
Subject: VFS: Put a small type field into struct dentry::d_flags

Put a type field into struct dentry::d_flags to indicate if the dentry is one
of the following types that relate particularly to pathwalk:

	Miss (negative dentry)
	Directory
	"Automount" directory (defective - no i_op->lookup())
	Symlink
	Other (regular, socket, fifo, device)

The type field is set to one of the first five types on a dentry by calls to
__d_instantiate() and d_obtain_alias() from information in the inode (if one is
given).

The type is cleared by dentry_unlink_inode() when it reconstitutes an existing
dentry as a negative dentry.

Accessors provided are:

	d_set_type(dentry, type)
	d_is_directory(dentry)
	d_is_autodir(dentry)
	d_is_symlink(dentry)
	d_is_file(dentry)
	d_is_negative(dentry)
	d_is_positive(dentry)

A bunch of checks in pathname resolution switched to those.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            |  42 +++++++++++++++++---
 fs/namei.c             |  95 ++++++++++++++++++---------------------------
 include/linux/dcache.h | 103 +++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 158 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index fb7bcf3ba5d6..525770e576db 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -343,6 +343,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
 	__releases(dentry->d_inode->i_lock)
 {
 	struct inode *inode = dentry->d_inode;
+	__d_clear_type(dentry);
 	dentry->d_inode = NULL;
 	hlist_del_init(&dentry->d_alias);
 	dentry_rcuwalk_barrier(dentry);
@@ -1648,14 +1649,42 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 }
 EXPORT_SYMBOL(d_set_d_op);
 
+static unsigned d_flags_for_inode(struct inode *inode)
+{
+	unsigned add_flags = DCACHE_FILE_TYPE;
+
+	if (!inode)
+		return DCACHE_MISS_TYPE;
+
+	if (S_ISDIR(inode->i_mode)) {
+		add_flags = DCACHE_DIRECTORY_TYPE;
+		if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
+			if (unlikely(!inode->i_op->lookup))
+				add_flags = DCACHE_AUTODIR_TYPE;
+			else
+				inode->i_opflags |= IOP_LOOKUP;
+		}
+	} else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+		if (unlikely(inode->i_op->follow_link))
+			add_flags = DCACHE_SYMLINK_TYPE;
+		else
+			inode->i_opflags |= IOP_NOFOLLOW;
+	}
+
+	if (unlikely(IS_AUTOMOUNT(inode)))
+		add_flags |= DCACHE_NEED_AUTOMOUNT;
+	return add_flags;
+}
+
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
+	unsigned add_flags = d_flags_for_inode(inode);
+
 	spin_lock(&dentry->d_lock);
-	if (inode) {
-		if (unlikely(IS_AUTOMOUNT(inode)))
-			dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+	dentry->d_flags &= ~DCACHE_ENTRY_TYPE;
+	dentry->d_flags |= add_flags;
+	if (inode)
 		hlist_add_head(&dentry->d_alias, &inode->i_dentry);
-	}
 	dentry->d_inode = inode;
 	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -1860,6 +1889,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	static const struct qstr anonstring = QSTR_INIT("/", 1);
 	struct dentry *tmp;
 	struct dentry *res;
+	unsigned add_flags;
 
 	if (!inode)
 		return ERR_PTR(-ESTALE);
@@ -1885,9 +1915,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	}
 
 	/* attach a disconnected dentry */
+	add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
+
 	spin_lock(&tmp->d_lock);
 	tmp->d_inode = inode;
-	tmp->d_flags |= DCACHE_DISCONNECTED;
+	tmp->d_flags |= add_flags;
 	hlist_add_head(&tmp->d_alias, &inode->i_dentry);
 	hlist_bl_lock(&tmp->d_sb->s_anon);
 	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
diff --git a/fs/namei.c b/fs/namei.c
index e5c0118ba9f8..e1fa43346c61 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1501,18 +1501,9 @@ static void terminate_walk(struct nameidata *nd)
  * so we keep a cache of "no, this doesn't need follow_link"
  * for the common case.
  */
-static inline int should_follow_link(struct inode *inode, int follow)
+static inline int should_follow_link(struct dentry *dentry, int follow)
 {
-	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
-		if (likely(inode->i_op->follow_link))
-			return follow;
-
-		/* This gets set once for the inode lifetime */
-		spin_lock(&inode->i_lock);
-		inode->i_opflags |= IOP_NOFOLLOW;
-		spin_unlock(&inode->i_lock);
-	}
-	return 0;
+	return unlikely(d_is_symlink(dentry)) ? follow : 0;
 }
 
 static inline int walk_component(struct nameidata *nd, struct path *path,
@@ -1542,7 +1533,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 	if (!inode)
 		goto out_path_put;
 
-	if (should_follow_link(inode, follow)) {
+	if (should_follow_link(path->dentry, follow)) {
 		if (nd->flags & LOOKUP_RCU) {
 			if (unlikely(unlazy_walk(nd, path->dentry))) {
 				err = -ECHILD;
@@ -1600,26 +1591,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 	return res;
 }
 
-/*
- * We really don't want to look at inode->i_op->lookup
- * when we don't have to. So we keep a cache bit in
- * the inode ->i_opflags field that says "yes, we can
- * do lookup on this inode".
- */
-static inline int can_lookup(struct inode *inode)
-{
-	if (likely(inode->i_opflags & IOP_LOOKUP))
-		return 1;
-	if (likely(!inode->i_op->lookup))
-		return 0;
-
-	/* We do this once for the lifetime of the inode */
-	spin_lock(&inode->i_lock);
-	inode->i_opflags |= IOP_LOOKUP;
-	spin_unlock(&inode->i_lock);
-	return 1;
-}
-
 /*
  * We can do the critical dentry name comparison and hashing
  * operations one word at a time, but we are limited to:
@@ -1823,7 +1794,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (err)
 				return err;
 		}
-		if (!can_lookup(nd->inode)) {
+		if (!d_is_directory(nd->path.dentry)) {
 			err = -ENOTDIR; 
 			break;
 		}
@@ -1841,9 +1812,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	if (flags & LOOKUP_ROOT) {
-		struct inode *inode = nd->root.dentry->d_inode;
+		struct dentry *root = nd->root.dentry;
+		struct inode *inode = root->d_inode;
 		if (*name) {
-			if (!can_lookup(inode))
+			if (!d_is_directory(root))
 				return -ENOTDIR;
 			retval = inode_permission(inode, MAY_EXEC);
 			if (retval)
@@ -1899,7 +1871,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		dentry = f.file->f_path.dentry;
 
 		if (*name) {
-			if (!can_lookup(dentry->d_inode)) {
+			if (!d_is_directory(dentry)) {
 				fdput(f);
 				return -ENOTDIR;
 			}
@@ -1981,7 +1953,7 @@ static int path_lookupat(int dfd, const char *name,
 		err = complete_walk(nd);
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
-		if (!can_lookup(nd->inode)) {
+		if (!d_is_directory(nd->path.dentry)) {
 			path_put(&nd->path);
 			err = -ENOTDIR;
 		}
@@ -2273,7 +2245,7 @@ done:
 	}
 	path->dentry = dentry;
 	path->mnt = mntget(nd->path.mnt);
-	if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW))
+	if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
 		return 1;
 	follow_mount(path);
 	error = 0;
@@ -2417,12 +2389,14 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
-static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 {
+	struct inode *inode = victim->d_inode;
 	int error;
 
-	if (!victim->d_inode)
+	if (d_is_negative(victim))
 		return -ENOENT;
+	BUG_ON(!inode);
 
 	BUG_ON(victim->d_parent->d_inode != dir);
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
@@ -2432,15 +2406,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
-	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+
+	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
+	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
 		return -EPERM;
 	if (isdir) {
-		if (!S_ISDIR(victim->d_inode->i_mode))
+		if (!d_is_directory(victim) && !d_is_autodir(victim))
 			return -ENOTDIR;
 		if (IS_ROOT(victim))
 			return -EBUSY;
-	} else if (S_ISDIR(victim->d_inode->i_mode))
+	} else if (d_is_directory(victim) || d_is_autodir(victim))
 		return -EISDIR;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
@@ -2974,7 +2949,7 @@ retry_lookup:
 	/*
 	 * create/update audit record if it already exists.
 	 */
-	if (path->dentry->d_inode)
+	if (d_is_positive(path->dentry))
 		audit_inode(name, path->dentry, 0);
 
 	/*
@@ -3003,12 +2978,12 @@ retry_lookup:
 finish_lookup:
 	/* we _can_ be in RCU mode here */
 	error = -ENOENT;
-	if (!inode) {
+	if (d_is_negative(path->dentry)) {
 		path_to_nameidata(path, nd);
 		goto out;
 	}
 
-	if (should_follow_link(inode, !symlink_ok)) {
+	if (should_follow_link(path->dentry, !symlink_ok)) {
 		if (nd->flags & LOOKUP_RCU) {
 			if (unlikely(unlazy_walk(nd, path->dentry))) {
 				error = -ECHILD;
@@ -3037,10 +3012,11 @@ finish_open:
 	}
 	audit_inode(name, nd->path.dentry, 0);
 	error = -EISDIR;
-	if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
+	if ((open_flag & O_CREAT) &&
+	    (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry)))
 		goto out;
 	error = -ENOTDIR;
-	if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
+	if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry))
 		goto out;
 	if (!S_ISREG(nd->inode->i_mode))
 		will_truncate = false;
@@ -3266,7 +3242,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	nd.root.mnt = mnt;
 	nd.root.dentry = dentry;
 
-	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
+	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
 	file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
@@ -3316,8 +3292,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
 		goto unlock;
 
 	error = -EEXIST;
-	if (dentry->d_inode)
+	if (d_is_positive(dentry))
 		goto fail;
+
 	/*
 	 * Special case - lookup gave negative, but... we had foo/bar/
 	 * From the vfs_mknod() POV we just have a negative dentry -
@@ -3706,7 +3683,7 @@ retry:
 		if (nd.last.name[nd.last.len])
 			goto slashes;
 		inode = dentry->d_inode;
-		if (!inode)
+		if (d_is_negative(dentry))
 			goto slashes;
 		ihold(inode);
 		error = security_path_unlink(&nd.path, dentry);
@@ -3731,8 +3708,12 @@ exit1:
 	return error;
 
 slashes:
-	error = !dentry->d_inode ? -ENOENT :
-		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
+	if (d_is_negative(dentry))
+		error = -ENOENT;
+	else if (d_is_directory(dentry) || d_is_autodir(dentry))
+		error = -EISDIR;
+	else
+		error = -ENOTDIR;
 	goto exit2;
 }
 
@@ -4046,7 +4027,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	       struct inode *new_dir, struct dentry *new_dentry)
 {
 	int error;
-	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+	int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
 	const unsigned char *old_name;
 
 	if (old_dentry->d_inode == new_dentry->d_inode)
@@ -4134,10 +4115,10 @@ retry:
 		goto exit3;
 	/* source must exist */
 	error = -ENOENT;
-	if (!old_dentry->d_inode)
+	if (d_is_negative(old_dentry))
 		goto exit4;
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
-	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
+	if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) {
 		error = -ENOTDIR;
 		if (oldnd.last.name[oldnd.last.len])
 			goto exit4;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 716c3760ee39..57e87e749a48 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -169,13 +169,13 @@ struct dentry_operations {
  */
 
 /* d_flags entries */
-#define DCACHE_OP_HASH		0x0001
-#define DCACHE_OP_COMPARE	0x0002
-#define DCACHE_OP_REVALIDATE	0x0004
-#define DCACHE_OP_DELETE	0x0008
-#define DCACHE_OP_PRUNE         0x0010
+#define DCACHE_OP_HASH			0x00000001
+#define DCACHE_OP_COMPARE		0x00000002
+#define DCACHE_OP_REVALIDATE		0x00000004
+#define DCACHE_OP_DELETE		0x00000008
+#define DCACHE_OP_PRUNE			0x00000010
 
-#define	DCACHE_DISCONNECTED	0x0020
+#define	DCACHE_DISCONNECTED		0x00000020
      /* This dentry is possibly not currently connected to the dcache tree, in
       * which case its parent will either be itself, or will have this flag as
       * well.  nfsd will not use a dentry with this bit set, but will first
@@ -186,30 +186,38 @@ struct dentry_operations {
       * dentry into place and return that dentry rather than the passed one,
       * typically using d_splice_alias. */
 
-#define DCACHE_REFERENCED	0x0040  /* Recently used, don't discard. */
-#define DCACHE_RCUACCESS	0x0080	/* Entry has ever been RCU-visible */
+#define DCACHE_REFERENCED		0x00000040 /* Recently used, don't discard. */
+#define DCACHE_RCUACCESS		0x00000080 /* Entry has ever been RCU-visible */
 
-#define DCACHE_CANT_MOUNT	0x0100
-#define DCACHE_GENOCIDE		0x0200
-#define DCACHE_SHRINK_LIST	0x0400
+#define DCACHE_CANT_MOUNT		0x00000100
+#define DCACHE_GENOCIDE			0x00000200
+#define DCACHE_SHRINK_LIST		0x00000400
 
-#define DCACHE_OP_WEAK_REVALIDATE	0x0800
+#define DCACHE_OP_WEAK_REVALIDATE	0x00000800
 
-#define DCACHE_NFSFS_RENAMED	0x1000
+#define DCACHE_NFSFS_RENAMED		0x00001000
      /* this dentry has been "silly renamed" and has to be deleted on the last
       * dput() */
-#define DCACHE_COOKIE		0x2000	/* For use by dcookie subsystem */
-#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x4000
+#define DCACHE_COOKIE			0x00002000 /* For use by dcookie subsystem */
+#define DCACHE_FSNOTIFY_PARENT_WATCHED	0x00004000
      /* Parent inode is watched by some fsnotify listener */
 
-#define DCACHE_MOUNTED		0x10000	/* is a mountpoint */
-#define DCACHE_NEED_AUTOMOUNT	0x20000	/* handle automount on this dir */
-#define DCACHE_MANAGE_TRANSIT	0x40000	/* manage transit from this dirent */
+#define DCACHE_DENTRY_KILLED		0x00008000
+
+#define DCACHE_MOUNTED			0x00010000 /* is a mountpoint */
+#define DCACHE_NEED_AUTOMOUNT		0x00020000 /* handle automount on this dir */
+#define DCACHE_MANAGE_TRANSIT		0x00040000 /* manage transit from this dirent */
 #define DCACHE_MANAGED_DENTRY \
 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
 
-#define DCACHE_LRU_LIST		0x80000
-#define DCACHE_DENTRY_KILLED	0x100000
+#define DCACHE_LRU_LIST			0x00080000
+
+#define DCACHE_ENTRY_TYPE		0x00700000
+#define DCACHE_MISS_TYPE		0x00000000 /* Negative dentry */
+#define DCACHE_DIRECTORY_TYPE		0x00100000 /* Normal directory */
+#define DCACHE_AUTODIR_TYPE		0x00200000 /* Lookupless directory (presumed automount) */
+#define DCACHE_SYMLINK_TYPE		0x00300000 /* Symlink */
+#define DCACHE_FILE_TYPE		0x00400000 /* Other file type */
 
 extern seqlock_t rename_lock;
 
@@ -394,6 +402,61 @@ static inline bool d_mountpoint(const struct dentry *dentry)
 	return dentry->d_flags & DCACHE_MOUNTED;
 }
 
+/*
+ * Directory cache entry type accessor functions.
+ */
+static inline void __d_set_type(struct dentry *dentry, unsigned type)
+{
+	dentry->d_flags = (dentry->d_flags & ~DCACHE_ENTRY_TYPE) | type;
+}
+
+static inline void __d_clear_type(struct dentry *dentry)
+{
+	__d_set_type(dentry, DCACHE_MISS_TYPE);
+}
+
+static inline void d_set_type(struct dentry *dentry, unsigned type)
+{
+	spin_lock(&dentry->d_lock);
+	__d_set_type(dentry, type);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline unsigned __d_entry_type(const struct dentry *dentry)
+{
+	return dentry->d_flags & DCACHE_ENTRY_TYPE;
+}
+
+static inline bool d_is_directory(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE;
+}
+
+static inline bool d_is_autodir(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE;
+}
+
+static inline bool d_is_symlink(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE;
+}
+
+static inline bool d_is_file(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_FILE_TYPE;
+}
+
+static inline bool d_is_negative(const struct dentry *dentry)
+{
+	return __d_entry_type(dentry) == DCACHE_MISS_TYPE;
+}
+
+static inline bool d_is_positive(const struct dentry *dentry)
+{
+	return !d_is_negative(dentry);
+}
+
 extern int sysctl_vfs_cache_pressure;
 
 static inline unsigned long vfs_pressure_ratio(unsigned long val)
-- 
cgit v1.3


From 5a3cd99285dc793a4022fa75ceeb323eb6d29ac9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 6 Nov 2013 09:54:52 -0500
Subject: iget/iget5: don't bother with ->i_lock until we find a match

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting           |  8 ++++++++
 drivers/staging/lustre/lustre/llite/namei.c |  2 +-
 fs/inode.c                                  | 20 ++++++--------------
 fs/ntfs/inode.c                             |  2 +-
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index f0890581f7f6..fe2b7ae6f962 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -455,3 +455,11 @@ in your dentry operations instead.
 	vfs_follow_link has been removed.  Filesystems must use nd_set_link
 	from ->follow_link for normal symlinks, or nd_jump_link for magic
 	/proc/<pid> style links.
+--
+[mandatory]
+	iget5_locked()/ilookup5()/ilookup5_nowait() test() callback used to be
+	called with both ->i_lock and inode_hash_lock held; the former is *not*
+	taken anymore, so verify that your callbacks do not rely on it (none
+	of the in-tree instances did).  inode_hash_lock is still held,
+	of course, so they are still serialized wrt removal from inode hash,
+	as well as wrt set() callback of iget5_locked().
diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c
index 34815b550e71..90bbdae824ac 100644
--- a/drivers/staging/lustre/lustre/llite/namei.c
+++ b/drivers/staging/lustre/lustre/llite/namei.c
@@ -83,7 +83,7 @@ int ll_unlock(__u32 mode, struct lustre_handle *lockh)
 }
 
 
-/* called from iget5_locked->find_inode() under inode_lock spinlock */
+/* called from iget5_locked->find_inode() under inode_hash_lock spinlock */
 static int ll_test_inode(struct inode *inode, void *opaque)
 {
 	struct ll_inode_info *lli = ll_i2info(inode);
diff --git a/fs/inode.c b/fs/inode.c
index b33ba8e021cc..bf7a8b3ab937 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -773,15 +773,11 @@ static struct inode *find_inode(struct super_block *sb,
 
 repeat:
 	hlist_for_each_entry(inode, head, i_hash) {
-		spin_lock(&inode->i_lock);
-		if (inode->i_sb != sb) {
-			spin_unlock(&inode->i_lock);
+		if (inode->i_sb != sb)
 			continue;
-		}
-		if (!test(inode, data)) {
-			spin_unlock(&inode->i_lock);
+		if (!test(inode, data))
 			continue;
-		}
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
@@ -804,15 +800,11 @@ static struct inode *find_inode_fast(struct super_block *sb,
 
 repeat:
 	hlist_for_each_entry(inode, head, i_hash) {
-		spin_lock(&inode->i_lock);
-		if (inode->i_ino != ino) {
-			spin_unlock(&inode->i_lock);
+		if (inode->i_ino != ino)
 			continue;
-		}
-		if (inode->i_sb != sb) {
-			spin_unlock(&inode->i_lock);
+		if (inode->i_sb != sb)
 			continue;
-		}
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 2778b0255dc6..ffb9b3675736 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -55,7 +55,7 @@
  *
  * Return 1 if the attributes match and 0 if not.
  *
- * NOTE: This function runs with the inode->i_lock spin lock held so it is not
+ * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
  * allowed to sleep.
  */
 int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
-- 
cgit v1.3


From b7a6ec52dd4eced4a9bcda9ca85b3c8af84d3c90 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 2 Oct 2013 17:01:18 -0400
Subject: vfs: split out vfs_getattr_nosec

The filehandle lookup code wants this version of getattr.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/stat.c          | 31 +++++++++++++++++++++++++------
 include/linux/fs.h |  1 +
 2 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/stat.c b/fs/stat.c
index d0ea7ef75e26..ae0c3cef9927 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -37,14 +37,21 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 
 EXPORT_SYMBOL(generic_fillattr);
 
-int vfs_getattr(struct path *path, struct kstat *stat)
+/**
+ * vfs_getattr_nosec - getattr without security checks
+ * @path: file to get attributes from
+ * @stat: structure to return attributes in
+ *
+ * Get attributes without calling security_inode_getattr.
+ *
+ * Currently the only caller other than vfs_getattr is internal to the
+ * filehandle lookup code, which uses only the inode number and returns
+ * no attributes to any user.  Any other code probably wants
+ * vfs_getattr.
+ */
+int vfs_getattr_nosec(struct path *path, struct kstat *stat)
 {
 	struct inode *inode = path->dentry->d_inode;
-	int retval;
-
-	retval = security_inode_getattr(path->mnt, path->dentry);
-	if (retval)
-		return retval;
 
 	if (inode->i_op->getattr)
 		return inode->i_op->getattr(path->mnt, path->dentry, stat);
@@ -53,6 +60,18 @@ int vfs_getattr(struct path *path, struct kstat *stat)
 	return 0;
 }
 
+EXPORT_SYMBOL(vfs_getattr_nosec);
+
+int vfs_getattr(struct path *path, struct kstat *stat)
+{
+	int retval;
+
+	retval = security_inode_getattr(path->mnt, path->dentry);
+	if (retval)
+		return retval;
+	return vfs_getattr_nosec(path, stat);
+}
+
 EXPORT_SYMBOL(vfs_getattr);
 
 int vfs_fstat(unsigned int fd, struct kstat *stat)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e190326ac212..5e44b0893db8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2504,6 +2504,7 @@ extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_put_link(struct dentry *, struct nameidata *, void *);
 extern int generic_readlink(struct dentry *, char __user *, int);
 extern void generic_fillattr(struct inode *, struct kstat *);
+int vfs_getattr_nosec(struct path *path, struct kstat *stat);
 extern int vfs_getattr(struct path *, struct kstat *);
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
 void inode_add_bytes(struct inode *inode, loff_t bytes);
-- 
cgit v1.3


From 950ee9566a5b6cc45d15f5fe044bab4f1e8b62cb Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 10 Sep 2013 11:41:12 -0400
Subject: exportfs: fix 32-bit nfsd handling of 64-bit inode numbers

Symptoms were spurious -ENOENTs on stat of an NFS filesystem from a
32-bit NFS server exporting a very large XFS filesystem, when the
server's cache is cold (so the inodes in question are not in cache).

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reported-by: Trevor Cordes <trevor@tecnopolis.ca>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index a235f0016889..c43fe9b39ff2 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -215,7 +215,7 @@ struct getdents_callback {
 	struct dir_context ctx;
 	char *name;		/* name that was found. It already points to a
 				   buffer NAME_MAX+1 is size */
-	unsigned long ino;	/* the inum we are looking for */
+	u64 ino;		/* the inum we are looking for */
 	int found;		/* inode matched? */
 	int sequence;		/* sequence counter */
 };
@@ -255,10 +255,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 	struct inode *dir = path->dentry->d_inode;
 	int error;
 	struct file *file;
+	struct kstat stat;
+	struct path child_path = {
+		.mnt = path->mnt,
+		.dentry = child,
+	};
 	struct getdents_callback buffer = {
 		.ctx.actor = filldir_one,
 		.name = name,
-		.ino = child->d_inode->i_ino
 	};
 
 	error = -ENOTDIR;
@@ -267,6 +271,16 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 	error = -EINVAL;
 	if (!dir->i_fop)
 		goto out;
+	/*
+	 * inode->i_ino is unsigned long, kstat->ino is u64, so the
+	 * former would be insufficient on 32-bit hosts when the
+	 * filesystem supports 64-bit inode numbers.  So we need to
+	 * actually call ->getattr, not just read i_ino:
+	 */
+	error = vfs_getattr_nosec(&child_path, &stat);
+	if (error)
+		return error;
+	buffer.ino = stat.ino;
 	/*
 	 * Open the directory ...
 	 */
-- 
cgit v1.3


From fbad2bd13235067eef0b5b5e18b40e84ca425815 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 3 Nov 2013 19:46:35 -0500
Subject: qnx4: i_sb is never NULL

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/qnx4/namei.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index d024505ba007..e62c8183777a 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -60,10 +60,6 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
 	struct buffer_head *bh;
 
 	*res_dir = NULL;
-	if (!dir->i_sb) {
-		printk(KERN_WARNING "qnx4: no superblock on dir.\n");
-		return NULL;
-	}
 	bh = NULL;
 	block = offset = blkofs = 0;
 	while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) {
-- 
cgit v1.3


From b19f13367454a57cd584031f0d52b05ac5ea97cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 3 Nov 2013 19:49:19 -0500
Subject: ocfs2: get rid of impossible checks

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/inode.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f87f9bd1edff..f29a90fde619 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -386,19 +386,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	u32 generation = 0;
 
 	status = -EINVAL;
-	if (inode == NULL || inode->i_sb == NULL) {
-		mlog(ML_ERROR, "bad inode\n");
-		return status;
-	}
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
 
-	if (!args) {
-		mlog(ML_ERROR, "bad inode args\n");
-		make_bad_inode(inode);
-		return status;
-	}
-
 	/*
 	 * To improve performance of cold-cache inode stats, we take
 	 * the cluster lock here if possible.
-- 
cgit v1.3


From 7632e465feb182cadc3c9aa1282a057201818a8c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 28 Jun 2012 12:10:55 -0400
Subject: dcache: use IS_ROOT to decide where dentry is hashed

Every hashed dentry is either hashed in the dentry_hashtable, or a
superblock's s_anon list.

__d_drop() assumes it can determine which is the case by checking
DCACHE_DISCONNECTED; this is not true.

It is true that when DCACHE_DISCONNECTED is cleared, the dentry is not
only hashed on dentry_hashtable, but is fully connected to its parents
back to the root.

But the converse is *not* true: fs/exportfs/expfs.c:reconnect_path()
attempts to connect a directory (found by filehandle lookup) back to
root by ascending to parents and performing lookups one at a time.  It
does not clear DCACHE_DISCONNECTED until it's done, and that is not at
all an atomic process.

In particular, it is possible for DCACHE_DISCONNECTED to be set on a
dentry which is hashed on the dentry_hashtable.

Instead, use IS_ROOT() to check which hash chain a dentry is on.  This
*does* work:

Dentries are hashed only by:

	- d_obtain_alias, which adds an IS_ROOT() dentry to sb_anon.

	- __d_rehash, called by _d_rehash: hashes to the dentry's
	  parent, and all callers of _d_rehash appear to have d_parent
	  set to a "real" parent.
	- __d_rehash, called by __d_move: rehashes the moved dentry to
	  hash chain determined by target, and assigns target's d_parent
	  to its d_parent, before dropping the dentry's d_lock.

Therefore I believe it's safe for a holder of a dentry's d_lock to
assume that it is hashed on sb_anon if and only if IS_ROOT(dentry) is
true.

I believe the incorrect assumption about DCACHE_DISCONNECTED was
originally introduced by ceb5bdc2d246 "fs: dcache per-bucket dcache hash
locking".

Also add a comment while we're here.

Cc: Nick Piggin <npiggin@kernel.dk>
Acked-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 525770e576db..c2add37082b6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -503,7 +503,12 @@ void __d_drop(struct dentry *dentry)
 {
 	if (!d_unhashed(dentry)) {
 		struct hlist_bl_head *b;
-		if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+		/*
+		 * Hashed dentries are normally on the dentry hashtable,
+		 * with the exception of those newly allocated by
+		 * d_obtain_alias, which are always IS_ROOT:
+		 */
+		if (unlikely(IS_ROOT(dentry)))
 			b = &dentry->d_sb->s_anon;
 		else
 			b = d_hash(dentry->d_parent, dentry->d_name.hash);
-- 
cgit v1.3


From e1a24bb0aa6abaa86b5a95638722ea2036dbaadd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 29 Jun 2012 16:20:47 -0400
Subject: dcache: Don't set DISCONNECTED on "pseudo filesystem" dentries

I can't for the life of me see any reason why anyone should care whether
a dentry that is never hooked into the dentry cache would need
DCACHE_DISCONNECTED set.

This originates from 4b936885ab04dc6e0bb0ef35e0e23c1a7364d9e5 "fs:
improve scalability of pseudo filesystems", which probably just made the
false assumption the DCACHE_DISCONNECTED was meant to be set on anything
not connected to a parent somehow.

So this is just confusing.  Ideally the only uses of DCACHE_DISCONNECTED
would be in the filehandle-lookup code, which needs it to ensure
dentries are connected into the dentry tree before use.

I left d_alloc_pseudo there even though it's now equivalent to
__d_alloc(), just on the theory the name is better documentation of its
intended use outside dcache.c.

Cc: Nick Piggin <npiggin@kernel.dk>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index c2add37082b6..67662beebc72 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1607,12 +1607,17 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 }
 EXPORT_SYMBOL(d_alloc);
 
+/**
+ * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
+ * @sb: the superblock
+ * @name: qstr of the name
+ *
+ * For a filesystem that just pins its dentries in memory and never
+ * performs lookups at all, return an unhashed IS_ROOT dentry.
+ */
 struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 {
-	struct dentry *dentry = __d_alloc(sb, name);
-	if (dentry)
-		dentry->d_flags |= DCACHE_DISCONNECTED;
-	return dentry;
+	return __d_alloc(sb, name);
 }
 EXPORT_SYMBOL(d_alloc_pseudo);
 
-- 
cgit v1.3


From f80de2cde10350b8d146e375ff8b634e72e6a827 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 18 Jul 2012 16:27:37 -0600
Subject: dcache: don't clear DCACHE_DISCONNECTED too early

DCACHE_DISCONNECTED should not be cleared until we're sure the dentry is
connected all the way up to the root of the filesystem.  It *shouldn't*
be cleared as soon as the dentry is connected to a parent.  That will
cause bugs at least on exportable filesystems.

Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 67662beebc72..72c99b88d264 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2757,7 +2757,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	spin_unlock(&dentry->d_lock);
 
 	/* anon->d_lock still locked, returns locked */
-	anon->d_flags &= ~DCACHE_DISCONNECTED;
 }
 
 /**
-- 
cgit v1.3


From 13a2c3be03973d61b6cb89ff870e758c86327bb7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 23 Oct 2013 16:09:16 -0400
Subject: dcache: fix outdated DCACHE_NEED_LOOKUP comment

The DCACHE_NEED_LOOKUP case referred to here was removed with
39e3c9553f34381a1b664c27b0c696a266a5735e "vfs: remove
DCACHE_NEED_LOOKUP".

There are only four real_lookup() callers and all of them pass in an
unhashed dentry just returned from d_alloc.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e1fa43346c61..2a5a7aa9f43f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1298,8 +1298,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
 }
 
 /*
- * Call i_op->lookup on the dentry.  The dentry must be negative but may be
- * hashed if it was pouplated with DCACHE_NEED_LOOKUP.
+ * Call i_op->lookup on the dentry.  The dentry must be negative and
+ * unhashed.
  *
  * dir->d_inode->i_mutex must be held
  */
-- 
cgit v1.3


From 854ff5caabb5974b7464b438aba0bc47f1b6cf34 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Oct 2013 15:48:53 -0400
Subject: exportfs: BUG_ON in crazy corner case

This would indicate a nasty bug in the dcache and has never triggered in
the past 10 years as far as I know.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c43fe9b39ff2..6d0a7fa9abb3 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -112,18 +112,14 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 	while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) {
 		struct dentry *pd = find_disconnected_root(target_dir);
 
+		BUG_ON(pd == mnt->mnt_sb->s_root);
+
 		if (!IS_ROOT(pd)) {
 			/* must have found a connected parent - great */
 			spin_lock(&pd->d_lock);
 			pd->d_flags &= ~DCACHE_DISCONNECTED;
 			spin_unlock(&pd->d_lock);
 			noprogress = 0;
-		} else if (pd == mnt->mnt_sb->s_root) {
-			printk(KERN_ERR "export: Eeek filesystem root is not connected, impossible\n");
-			spin_lock(&pd->d_lock);
-			pd->d_flags &= ~DCACHE_DISCONNECTED;
-			spin_unlock(&pd->d_lock);
-			noprogress = 0;
 		} else {
 			/*
 			 * We have hit the top of a disconnected path, try to
-- 
cgit v1.3


From 78cee9a8e4b42b3f585ea3bd1c076f5a76fee722 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 22 Oct 2013 20:59:19 -0400
Subject: exportfs: more detailed comment for path_reconnect

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 6d0a7fa9abb3..87e6dca69e43 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -93,7 +93,19 @@ find_disconnected_root(struct dentry *dentry)
 /*
  * Make sure target_dir is fully connected to the dentry tree.
  *
- * It may already be, as the flag isn't always updated when connection happens.
+ * On successful return, DCACHE_DISCONNECTED will be cleared on
+ * target_dir, and target_dir->d_parent->...->d_parent will reach the
+ * root of the filesystem.
+ *
+ * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected.
+ * But the converse is not true: target_dir may have DCACHE_DISCONNECTED
+ * set but already be connected.  In that case we'll verify the
+ * connection to root and then clear the flag.
+ *
+ * Note that target_dir could be removed by a concurrent operation.  In
+ * that case reconnect_path may still succeed with target_dir fully
+ * connected, but further operations using the filehandle will fail when
+ * necessary (due to S_DEAD being set on the directory).
  */
 static int
 reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
-- 
cgit v1.3


From 0dbc018a490ed482a1236aad77ac12e20742b322 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 9 Sep 2013 16:15:13 -0400
Subject: exportfs: clear DISCONNECTED on all parents sooner

Once we've found any connected parent, we know all our parents are
connected--that's true even if there's a concurrent rename.  May as well
clear them all at once and be done with it.

Reviewed-by: Cristoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 87e6dca69e43..c65b748688ff 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -90,6 +90,24 @@ find_disconnected_root(struct dentry *dentry)
 	return dentry;
 }
 
+static void clear_disconnected(struct dentry *dentry)
+{
+	dget(dentry);
+	while (dentry->d_flags & DCACHE_DISCONNECTED) {
+		struct dentry *parent = dget_parent(dentry);
+
+		WARN_ON_ONCE(IS_ROOT(dentry));
+
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags &= ~DCACHE_DISCONNECTED;
+		spin_unlock(&dentry->d_lock);
+
+		dput(dentry);
+		dentry = parent;
+	}
+	dput(dentry);
+}
+
 /*
  * Make sure target_dir is fully connected to the dentry tree.
  *
@@ -128,10 +146,9 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 
 		if (!IS_ROOT(pd)) {
 			/* must have found a connected parent - great */
-			spin_lock(&pd->d_lock);
-			pd->d_flags &= ~DCACHE_DISCONNECTED;
-			spin_unlock(&pd->d_lock);
-			noprogress = 0;
+			clear_disconnected(target_dir);
+			dput(pd);
+			break;
 		} else {
 			/*
 			 * We have hit the top of a disconnected path, try to
-- 
cgit v1.3


From a056cc8934c7bd046dc44af559bba163115fde40 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 16 Oct 2013 21:09:30 -0400
Subject: exportfs: stop retrying once we race with rename/remove

There are two places here where we could race with a rename or remove:

	- We could find the parent, but then be removed or renamed away
	  from that parent directory before finding our name in that
	  directory.
	- We could find the parent, and find our name in that parent,
	  but then be renamed or removed before we look ourselves up by
	  that name in that parent.

In both cases the concurrent rename or remove will take care of
reconnecting the directory that we're currently examining.  Our target
directory should then also be connected.  Check this and clear
DISCONNECTED in these cases instead of looping around again.

Note: we *do* need to check that this actually happened if we want to be
robust in the face of corrupted filesystems: a corrupted filesystem
could just return a completely wrong parent, and we want to fail with an
error in that case before starting to clear DISCONNECTED on
non-DISCONNECTED filesystems.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c65b748688ff..6b5ddd5492bc 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -90,6 +90,23 @@ find_disconnected_root(struct dentry *dentry)
 	return dentry;
 }
 
+static bool dentry_connected(struct dentry *dentry)
+{
+	dget(dentry);
+	while (dentry->d_flags & DCACHE_DISCONNECTED) {
+		struct dentry *parent = dget_parent(dentry);
+
+		dput(dentry);
+		if (IS_ROOT(dentry)) {
+			dput(parent);
+			return false;
+		}
+		dentry = parent;
+	}
+	dput(dentry);
+	return true;
+}
+
 static void clear_disconnected(struct dentry *dentry)
 {
 	dget(dentry);
@@ -189,9 +206,9 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 				dput(pd);
 				if (err == -ENOENT)
 					/* some race between get_parent and
-					 * get_name?  just try again
+					 * get_name?
 					 */
-					continue;
+					goto out_reconnected;
 				break;
 			}
 			dprintk("%s: found name: %s\n", __func__, nbuf);
@@ -211,12 +228,12 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 			 * hopefully, npd == pd, though it isn't really
 			 * a problem if it isn't
 			 */
+			dput(npd);
+			dput(ppd);
 			if (npd == pd)
 				noprogress = 0;
 			else
-				printk("%s: npd != pd\n", __func__);
-			dput(npd);
-			dput(ppd);
+				goto out_reconnected;
 			if (IS_ROOT(pd)) {
 				/* something went wrong, we have to give up */
 				dput(pd);
@@ -233,6 +250,24 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 		return err;
 	}
 
+	return 0;
+out_reconnected:
+	/*
+	 * Someone must have renamed our entry into another parent, in
+	 * which case it has been reconnected by the rename.
+	 *
+	 * Or someone removed it entirely, in which case filehandle
+	 * lookup will succeed but the directory is now IS_DEAD and
+	 * subsequent operations on it will fail.
+	 *
+	 * Alternatively, maybe there was no race at all, and the
+	 * filesystem is just corrupt and gave us a parent that doesn't
+	 * actually contain any entry pointing to this inode.  So,
+	 * double check that this worked and return -ESTALE if not:
+	 */
+	if (!dentry_connected(target_dir))
+		return -ESTALE;
+	clear_disconnected(target_dir);
 	return 0;
 }
 
-- 
cgit v1.3


From e4b70ebeeba954cb9cbcf0f19016bb9c2b8711c1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 16 Oct 2013 21:20:19 -0400
Subject: exportfs: eliminate unused "noprogress" counter

Note this counter is now being set to 0 on every pass through the loop,
so it no longer serves any useful purpose.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 6b5ddd5492bc..d8ba88ac10e5 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -145,18 +145,9 @@ static void clear_disconnected(struct dentry *dentry)
 static int
 reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 {
-	int noprogress = 0;
 	int err = -ESTALE;
 
-	/*
-	 * It is possible that a confused file system might not let us complete
-	 * the path to the root.  For example, if get_parent returns a directory
-	 * in which we cannot find a name for the child.  While this implies a
-	 * very sick filesystem we don't want it to cause knfsd to spin.  Hence
-	 * the noprogress counter.  If we go through the loop 10 times (2 is
-	 * probably enough) without getting anywhere, we just give up
-	 */
-	while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) {
+	while (target_dir->d_flags & DCACHE_DISCONNECTED) {
 		struct dentry *pd = find_disconnected_root(target_dir);
 
 		BUG_ON(pd == mnt->mnt_sb->s_root);
@@ -230,9 +221,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 			 */
 			dput(npd);
 			dput(ppd);
-			if (npd == pd)
-				noprogress = 0;
-			else
+			if (npd != pd)
 				goto out_reconnected;
 			if (IS_ROOT(pd)) {
 				/* something went wrong, we have to give up */
-- 
cgit v1.3


From bbf7a8a3562f2de49ce24db3be0f514459dd7f8b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 17 Oct 2013 11:13:00 -0400
Subject: exportfs: move most of reconnect_path to helper function

Also replace 3 easily-confused three-letter acronyms by more helpful
variable names.

Just cleanup, no change in functionality, with one exception: the
dentry_connected() check in the "out_reconnected" case will now only
check the ancestors of the current dentry instead of checking all the
way from target_dir.  Since we've already verified connectivity up to
this dentry, that should be sufficient.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 164 +++++++++++++++++++++++++++-------------------------
 1 file changed, 86 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index d8ba88ac10e5..d32ead9026f0 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -125,6 +125,86 @@ static void clear_disconnected(struct dentry *dentry)
 	dput(dentry);
 }
 
+/*
+ * Reconnect a directory dentry with its parent.
+ *
+ * This can return a dentry, or NULL, or an error.
+ *
+ * In the first case the returned dentry is the parent of the given
+ * dentry, and may itself need to be reconnected to its parent.
+ *
+ * In the NULL case, a concurrent VFS operation has either renamed or
+ * removed this directory.  The concurrent operation has reconnected our
+ * dentry, so we no longer need to.
+ */
+static struct dentry *reconnect_one(struct vfsmount *mnt,
+		struct dentry *dentry, char *nbuf)
+{
+	struct dentry *parent;
+	struct dentry *tmp;
+	int err;
+
+	parent = ERR_PTR(-EACCES);
+	mutex_lock(&dentry->d_inode->i_mutex);
+	if (mnt->mnt_sb->s_export_op->get_parent)
+		parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+
+	if (IS_ERR(parent)) {
+		dprintk("%s: get_parent of %ld failed, err %d\n",
+			__func__, dentry->d_inode->i_ino, PTR_ERR(parent));
+		return parent;
+	}
+
+	dprintk("%s: find name of %lu in %lu\n", __func__,
+		dentry->d_inode->i_ino, parent->d_inode->i_ino);
+	err = exportfs_get_name(mnt, parent, nbuf, dentry);
+	if (err == -ENOENT)
+		goto out_reconnected;
+	if (err)
+		goto out_err;
+	dprintk("%s: found name: %s\n", __func__, nbuf);
+	mutex_lock(&parent->d_inode->i_mutex);
+	tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
+	mutex_unlock(&parent->d_inode->i_mutex);
+	if (IS_ERR(tmp)) {
+		dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
+		goto out_err;
+	}
+	if (tmp != dentry) {
+		dput(tmp);
+		goto out_reconnected;
+	}
+	dput(tmp);
+	if (IS_ROOT(dentry)) {
+		err = -ESTALE;
+		goto out_err;
+	}
+	return parent;
+
+out_err:
+	dput(parent);
+	return ERR_PTR(err);
+out_reconnected:
+	dput(parent);
+	/*
+	 * Someone must have renamed our entry into another parent, in
+	 * which case it has been reconnected by the rename.
+	 *
+	 * Or someone removed it entirely, in which case filehandle
+	 * lookup will succeed but the directory is now IS_DEAD and
+	 * subsequent operations on it will fail.
+	 *
+	 * Alternatively, maybe there was no race at all, and the
+	 * filesystem is just corrupt and gave us a parent that doesn't
+	 * actually contain any entry pointing to this inode.  So,
+	 * double check that this worked and return -ESTALE if not:
+	 */
+	if (!dentry_connected(dentry))
+		return ERR_PTR(-ESTALE);
+	return NULL;
+}
+
 /*
  * Make sure target_dir is fully connected to the dentry tree.
  *
@@ -158,76 +238,19 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 			dput(pd);
 			break;
 		} else {
+			struct dentry *parent;
 			/*
 			 * We have hit the top of a disconnected path, try to
 			 * find parent and connect.
-			 *
-			 * Racing with some other process renaming a directory
-			 * isn't much of a problem here.  If someone renames
-			 * the directory, it will end up properly connected,
-			 * which is what we want
-			 *
-			 * Getting the parent can't be supported generically,
-			 * the locking is too icky.
-			 *
-			 * Instead we just return EACCES.  If server reboots
-			 * or inodes get flushed, you lose
-			 */
-			struct dentry *ppd = ERR_PTR(-EACCES);
-			struct dentry *npd;
-
-			mutex_lock(&pd->d_inode->i_mutex);
-			if (mnt->mnt_sb->s_export_op->get_parent)
-				ppd = mnt->mnt_sb->s_export_op->get_parent(pd);
-			mutex_unlock(&pd->d_inode->i_mutex);
-
-			if (IS_ERR(ppd)) {
-				err = PTR_ERR(ppd);
-				dprintk("%s: get_parent of %ld failed, err %d\n",
-					__func__, pd->d_inode->i_ino, err);
-				dput(pd);
-				break;
-			}
-
-			dprintk("%s: find name of %lu in %lu\n", __func__,
-				pd->d_inode->i_ino, ppd->d_inode->i_ino);
-			err = exportfs_get_name(mnt, ppd, nbuf, pd);
-			if (err) {
-				dput(ppd);
-				dput(pd);
-				if (err == -ENOENT)
-					/* some race between get_parent and
-					 * get_name?
-					 */
-					goto out_reconnected;
-				break;
-			}
-			dprintk("%s: found name: %s\n", __func__, nbuf);
-			mutex_lock(&ppd->d_inode->i_mutex);
-			npd = lookup_one_len(nbuf, ppd, strlen(nbuf));
-			mutex_unlock(&ppd->d_inode->i_mutex);
-			if (IS_ERR(npd)) {
-				err = PTR_ERR(npd);
-				dprintk("%s: lookup failed: %d\n",
-					__func__, err);
-				dput(ppd);
-				dput(pd);
-				break;
-			}
-			/* we didn't really want npd, we really wanted
-			 * a side-effect of the lookup.
-			 * hopefully, npd == pd, though it isn't really
-			 * a problem if it isn't
 			 */
-			dput(npd);
-			dput(ppd);
-			if (npd != pd)
+			 parent = reconnect_one(mnt, pd, nbuf);
+			 if (!parent)
 				goto out_reconnected;
-			if (IS_ROOT(pd)) {
-				/* something went wrong, we have to give up */
-				dput(pd);
+			if (IS_ERR(parent)) {
+				err = PTR_ERR(parent);
 				break;
 			}
+			dput(parent);
 		}
 		dput(pd);
 	}
@@ -241,21 +264,6 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 
 	return 0;
 out_reconnected:
-	/*
-	 * Someone must have renamed our entry into another parent, in
-	 * which case it has been reconnected by the rename.
-	 *
-	 * Or someone removed it entirely, in which case filehandle
-	 * lookup will succeed but the directory is now IS_DEAD and
-	 * subsequent operations on it will fail.
-	 *
-	 * Alternatively, maybe there was no race at all, and the
-	 * filesystem is just corrupt and gave us a parent that doesn't
-	 * actually contain any entry pointing to this inode.  So,
-	 * double check that this worked and return -ESTALE if not:
-	 */
-	if (!dentry_connected(target_dir))
-		return -ESTALE;
 	clear_disconnected(target_dir);
 	return 0;
 }
-- 
cgit v1.3


From efbf201f7a0be7ffc6532e672fbccb0eed4f5de0 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 17 Oct 2013 21:42:35 -0400
Subject: exportfs: better variable name

Replace another unhelpful acronym.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index d32ead9026f0..b33b9c4deea0 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -228,14 +228,14 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 	int err = -ESTALE;
 
 	while (target_dir->d_flags & DCACHE_DISCONNECTED) {
-		struct dentry *pd = find_disconnected_root(target_dir);
+		struct dentry *dentry = find_disconnected_root(target_dir);
 
-		BUG_ON(pd == mnt->mnt_sb->s_root);
+		BUG_ON(dentry == mnt->mnt_sb->s_root);
 
-		if (!IS_ROOT(pd)) {
+		if (!IS_ROOT(dentry)) {
 			/* must have found a connected parent - great */
 			clear_disconnected(target_dir);
-			dput(pd);
+			dput(dentry);
 			break;
 		} else {
 			struct dentry *parent;
@@ -243,7 +243,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 			 * We have hit the top of a disconnected path, try to
 			 * find parent and connect.
 			 */
-			 parent = reconnect_one(mnt, pd, nbuf);
+			 parent = reconnect_one(mnt, dentry, nbuf);
 			 if (!parent)
 				goto out_reconnected;
 			if (IS_ERR(parent)) {
@@ -252,7 +252,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 			}
 			dput(parent);
 		}
-		dput(pd);
+		dput(dentry);
 	}
 
 	if (target_dir->d_flags & DCACHE_DISCONNECTED) {
-- 
cgit v1.3


From f27c9298fd717e1f7e63e314a7a85a3a7e77139d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 17 Oct 2013 21:34:21 -0400
Subject: exportfs: fix quadratic behavior in filehandle lookup

Suppose we're given the filehandle for a directory whose closest
ancestor in the dcache is its Nth ancestor.

The main loop in reconnect_path searches for an IS_ROOT ancestor of
target_dir, reconnects that ancestor to its parent, then recommences the
search for an IS_ROOT ancestor from target_dir.

This behavior is quadratic in N.  And there's really no need to restart
the search from target_dir each time: once a directory has been looked
up, it won't become IS_ROOT again.  So instead of starting from
target_dir each time, we can continue where we left off.

This simplifies the code and improves performance on very deep directory
heirachies.  (I can't think of any reason anyone should need heirarchies
a hundred or more deep, but the performance improvement may be valuable
if only to limit damage in case of abuse.)

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 66 +++++++++++------------------------------------------
 1 file changed, 13 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b33b9c4deea0..48a359dd286e 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -69,27 +69,6 @@ find_acceptable_alias(struct dentry *result,
 	return NULL;
 }
 
-/*
- * Find root of a disconnected subtree and return a reference to it.
- */
-static struct dentry *
-find_disconnected_root(struct dentry *dentry)
-{
-	dget(dentry);
-	while (!IS_ROOT(dentry)) {
-		struct dentry *parent = dget_parent(dentry);
-
-		if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
-			dput(parent);
-			break;
-		}
-
-		dput(dentry);
-		dentry = parent;
-	}
-	return dentry;
-}
-
 static bool dentry_connected(struct dentry *dentry)
 {
 	dget(dentry);
@@ -225,45 +204,26 @@ out_reconnected:
 static int
 reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 {
-	int err = -ESTALE;
+	struct dentry *dentry, *parent;
 
-	while (target_dir->d_flags & DCACHE_DISCONNECTED) {
-		struct dentry *dentry = find_disconnected_root(target_dir);
+	dentry = dget(target_dir);
 
+	while (dentry->d_flags & DCACHE_DISCONNECTED) {
 		BUG_ON(dentry == mnt->mnt_sb->s_root);
 
-		if (!IS_ROOT(dentry)) {
-			/* must have found a connected parent - great */
-			clear_disconnected(target_dir);
-			dput(dentry);
+		if (IS_ROOT(dentry))
+			parent = reconnect_one(mnt, dentry, nbuf);
+		else
+			parent = dget_parent(dentry);
+
+		if (!parent)
 			break;
-		} else {
-			struct dentry *parent;
-			/*
-			 * We have hit the top of a disconnected path, try to
-			 * find parent and connect.
-			 */
-			 parent = reconnect_one(mnt, dentry, nbuf);
-			 if (!parent)
-				goto out_reconnected;
-			if (IS_ERR(parent)) {
-				err = PTR_ERR(parent);
-				break;
-			}
-			dput(parent);
-		}
 		dput(dentry);
+		if (IS_ERR(parent))
+			return PTR_ERR(parent);
+		dentry = parent;
 	}
-
-	if (target_dir->d_flags & DCACHE_DISCONNECTED) {
-		/* something went wrong - oh-well */
-		if (!err)
-			err = -ESTALE;
-		return err;
-	}
-
-	return 0;
-out_reconnected:
+	dput(dentry);
 	clear_disconnected(target_dir);
 	return 0;
 }
-- 
cgit v1.3


From 375e289ea85166c5241c570940e7e7e966c63a9f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 18 Apr 2012 15:16:33 -0400
Subject: vfs: pull ext4's double-i_mutex-locking into common code

We want to do this elsewhere as well.

Also catch any attempts to use it for directories (where this ordering
would conflict with ancestor-first directory ordering in lock_rename).

Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Dave Chinner <david@fromorbit.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/ext4.h        |  2 --
 fs/ext4/ioctl.c       |  4 ++--
 fs/ext4/move_extent.c | 40 ++--------------------------------------
 fs/inode.c            | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/fs.h    |  3 +++
 5 files changed, 43 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af815ea9d7cc..d01d62315f7e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2734,8 +2734,6 @@ extern void ext4_double_down_write_data_sem(struct inode *first,
 					    struct inode *second);
 extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
 					  struct inode *donor_inode);
-void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
-void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a569d335f804..60589b60e9b0 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -130,7 +130,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 
 	/* Protect orig inodes against a truncate and make sure,
 	 * that only 1 swap_inode_boot_loader is running. */
-	ext4_inode_double_lock(inode, inode_bl);
+	lock_two_nondirectories(inode, inode_bl);
 
 	truncate_inode_pages(&inode->i_data, 0);
 	truncate_inode_pages(&inode_bl->i_data, 0);
@@ -205,7 +205,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	ext4_inode_resume_unlocked_dio(inode);
 	ext4_inode_resume_unlocked_dio(inode_bl);
 
-	ext4_inode_double_unlock(inode, inode_bl);
+	unlock_two_nondirectories(inode, inode_bl);
 
 	iput(inode_bl);
 
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7fa4d855dbd5..773b503bd18c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1202,42 +1202,6 @@ mext_check_arguments(struct inode *orig_inode,
 	return 0;
 }
 
-/**
- * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
- *
- * @inode1:	the inode structure
- * @inode2:	the inode structure
- *
- * Lock two inodes' i_mutex
- */
-void
-ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
-	BUG_ON(inode1 == inode2);
-	if (inode1 < inode2) {
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
-	} else {
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
-	}
-}
-
-/**
- * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
- *
- * @inode1:     the inode that is released first
- * @inode2:     the inode that is released second
- *
- */
-
-void
-ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
-	mutex_unlock(&inode1->i_mutex);
-	mutex_unlock(&inode2->i_mutex);
-}
-
 /**
  * ext4_move_extents - Exchange the specified range of a file
  *
@@ -1327,7 +1291,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		return -EINVAL;
 	}
 	/* Protect orig and donor inodes against a truncate */
-	ext4_inode_double_lock(orig_inode, donor_inode);
+	lock_two_nondirectories(orig_inode, donor_inode);
 
 	/* Wait for all existing dio workers */
 	ext4_inode_block_unlocked_dio(orig_inode);
@@ -1535,7 +1499,7 @@ out:
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	ext4_inode_resume_unlocked_dio(orig_inode);
 	ext4_inode_resume_unlocked_dio(donor_inode);
-	ext4_inode_double_unlock(orig_inode, donor_inode);
+	unlock_two_nondirectories(orig_inode, donor_inode);
 
 	return ret;
 }
diff --git a/fs/inode.c b/fs/inode.c
index bf7a8b3ab937..ef25afdf3906 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -942,6 +942,42 @@ void unlock_new_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(unlock_new_inode);
 
+/**
+ * lock_two_nondirectories - take two i_mutexes on non-directory objects
+ * @inode1: first inode to lock
+ * @inode2: second inode to lock
+ */
+void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
+{
+	WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
+	if (inode1 == inode2 || !inode2) {
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+		return;
+	}
+	WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
+	if (inode1 < inode2) {
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+	} else {
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
+	}
+}
+EXPORT_SYMBOL(lock_two_nondirectories);
+
+/**
+ * unlock_two_nondirectories - release locks from lock_two_nondirectories()
+ * @inode1: first inode to unlock
+ * @inode2: second inode to unlock
+ */
+void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
+{
+	mutex_unlock(&inode1->i_mutex);
+	if (inode2 && inode2 != inode1)
+		mutex_unlock(&inode2->i_mutex);
+}
+EXPORT_SYMBOL(unlock_two_nondirectories);
+
 /**
  * iget5_locked - obtain an inode from a mounted file system
  * @sb:		super block of file system
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5e44b0893db8..4e1a0b41f966 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -637,6 +637,9 @@ enum inode_i_mutex_lock_class
 	I_MUTEX_QUOTA
 };
 
+void lock_two_nondirectories(struct inode *, struct inode*);
+void unlock_two_nondirectories(struct inode *, struct inode*);
+
 /*
  * NOTE: in a 32bit arch with a preemptable kernel and
  * an UP compile the i_size_read/write must be atomic
-- 
cgit v1.3


From 275555163e3ac09c016c964a10e0f389cf152c4c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 25 Apr 2012 07:19:52 -0400
Subject: vfs: don't use PARENT/CHILD lock classes for non-directories

Reserve I_MUTEX_PARENT and I_MUTEX_CHILD for locking of actual
directories.

(Also I_MUTEX_QUOTA isn't really a meaningful name for this locking
class any more; fixed in a later patch.)

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index ef25afdf3906..51f329b98aa4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -951,16 +951,16 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
 {
 	WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
 	if (inode1 == inode2 || !inode2) {
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+		mutex_lock(&inode1->i_mutex);
 		return;
 	}
 	WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
 	if (inode1 < inode2) {
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+		mutex_lock(&inode1->i_mutex);
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_QUOTA);
 	} else {
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
+		mutex_lock(&inode2->i_mutex);
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_QUOTA);
 	}
 }
 EXPORT_SYMBOL(lock_two_nondirectories);
-- 
cgit v1.3


From 40bd22c9f8617ddd5da06044c81f72a2cf700791 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 18 Apr 2012 15:21:34 -0400
Subject: vfs: rename I_MUTEX_QUOTA now that it's not used for quotas

I_MUTEX_QUOTA is now just being used whenever we want to lock two
non-directories.  So the name isn't right.  I_MUTEX_NONDIR2 isn't
especially elegant but it's the best I could think of.

Also fix some outdated documentation.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 4 ++--
 include/linux/fs.h | 9 ++++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 51f329b98aa4..ce48c359ce9e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -957,10 +957,10 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
 	WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
 	if (inode1 < inode2) {
 		mutex_lock(&inode1->i_mutex);
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_QUOTA);
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
 	} else {
 		mutex_lock(&inode2->i_mutex);
-		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_QUOTA);
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_NONDIR2);
 	}
 }
 EXPORT_SYMBOL(lock_two_nondirectories);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4e1a0b41f966..ed7f94af1ab2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -623,10 +623,13 @@ static inline int inode_unhashed(struct inode *inode)
  * 0: the object of the current VFS operation
  * 1: parent
  * 2: child/target
- * 3: quota file
+ * 3: xattr
+ * 4: second non-directory
+ * The last is for certain operations (such as rename) which lock two
+ * non-directories at once.
  *
  * The locking order between these classes is
- * parent -> child -> normal -> xattr -> quota
+ * parent -> child -> normal -> xattr -> second non-directory
  */
 enum inode_i_mutex_lock_class
 {
@@ -634,7 +637,7 @@ enum inode_i_mutex_lock_class
 	I_MUTEX_PARENT,
 	I_MUTEX_CHILD,
 	I_MUTEX_XATTR,
-	I_MUTEX_QUOTA
+	I_MUTEX_NONDIR2
 };
 
 void lock_two_nondirectories(struct inode *, struct inode*);
-- 
cgit v1.3


From 6cedba8962f440c72447f811d0d530a8a9dc637a Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 5 Mar 2012 11:40:41 -0500
Subject: vfs: take i_mutex on renamed file

A read delegation is used by NFSv4 as a guarantee that a client can
perform local read opens without informing the server.

The open operation takes the last component of the pathname as an
argument, thus is also a lookup operation, and giving the client the
above guarantee means informing the client before we allow anything that
would change the set of names pointing to the inode.

Therefore, we need to break delegations on rename, link, and unlink.

We also need to prevent new delegations from being acquired while one of
these operations is in progress.

We could add some completely new locking for that purpose, but it's
simpler to use the i_mutex, since that's already taken by all the
operations we care about.

The single exception is rename.  So, modify rename to take the i_mutex
on the file that is being renamed.

Also fix up lockdep and Documentation/filesystems/directory-locking to
reflect the change.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/directory-locking | 31 ++++++++++++++++++++---------
 fs/namei.c                                  | 10 +++++-----
 2 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking
index ff7b611abf33..09bbf9a54f80 100644
--- a/Documentation/filesystems/directory-locking
+++ b/Documentation/filesystems/directory-locking
@@ -2,6 +2,10 @@
 kinds of locks - per-inode (->i_mutex) and per-filesystem
 (->s_vfs_rename_mutex).
 
+	When taking the i_mutex on multiple non-directory objects, we
+always acquire the locks in order by increasing address.  We'll call
+that "inode pointer" order in the following.
+
 	For our purposes all operations fall in 5 classes:
 
 1) read access.  Locking rules: caller locks directory we are accessing.
@@ -12,8 +16,9 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem
 locks victim and calls the method.
 
 4) rename() that is _not_ cross-directory.  Locking rules: caller locks
-the parent, finds source and target, if target already exists - locks it
-and then calls the method.
+the parent and finds source and target.  If target already exists, lock
+it.  If source is a non-directory, lock it.  If that means we need to
+lock both, lock them in inode pointer order.
 
 5) link creation.  Locking rules:
 	* lock parent
@@ -30,7 +35,9 @@ rules:
 		fail with -ENOTEMPTY
 	* if new parent is equal to or is a descendent of source
 		fail with -ELOOP
-	* if target exists - lock it.
+	* If target exists, lock it.  If source is a non-directory, lock
+	  it.  In case that means we need to lock both source and target,
+	  do so in inode pointer order.
 	* call the method.
 
 
@@ -56,9 +63,11 @@ objects - A < B iff A is an ancestor of B.
     renames will be blocked on filesystem lock and we don't start changing
     the order until we had acquired all locks).
 
-(3) any operation holds at most one lock on non-directory object and
-    that lock is acquired after all other locks.  (Proof: see descriptions
-    of operations).
+(3) locks on non-directory objects are acquired only after locks on
+    directory objects, and are acquired in inode pointer order.
+    (Proof: all operations but renames take lock on at most one
+    non-directory object, except renames, which take locks on source and
+    target in inode pointer order in the case they are not directories.)
 
 	Now consider the minimal deadlock.  Each process is blocked on
 attempt to acquire some lock and already holds at least one lock.  Let's
@@ -66,9 +75,13 @@ consider the set of contended locks.  First of all, filesystem lock is
 not contended, since any process blocked on it is not holding any locks.
 Thus all processes are blocked on ->i_mutex.
 
-	Non-directory objects are not contended due to (3).  Thus link
-creation can't be a part of deadlock - it can't be blocked on source
-and it means that it doesn't hold any locks.
+	By (3), any process holding a non-directory lock can only be
+waiting on another non-directory lock with a larger address.  Therefore
+the process holding the "largest" such lock can always make progress, and
+non-directory objects are not included in the set of contended locks.
+
+	Thus link creation can't be a part of deadlock - it can't be
+blocked on source and it means that it doesn't hold any locks.
 
 	Any contended object is either held by cross-directory rename or
 has a child that is also contended.  Indeed, suppose that it is held by
diff --git a/fs/namei.c b/fs/namei.c
index 2a5a7aa9f43f..88cec0330bf7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3918,7 +3918,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	   That's where 4.4 screws up. Current fix: serialization on
  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
  *	   story.
- *	c) we have to lock _three_ objects - parents and victim (if it exists).
+ *	c) we have to lock _four_ objects - parents and victim (if it exists),
+ *	   and source (if it is not a directory).
  *	   And that - after we got ->i_mutex on parents (until then we don't know
  *	   whether the target exists).  Solution: try to be smart with locking
  *	   order for inodes.  We rely on the fact that tree topology may change
@@ -3994,6 +3995,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 			    struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct inode *target = new_dentry->d_inode;
+	struct inode *source = old_dentry->d_inode;
 	int error;
 
 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -4001,8 +4003,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 		return error;
 
 	dget(new_dentry);
-	if (target)
-		mutex_lock(&target->i_mutex);
+	lock_two_nondirectories(source, target);
 
 	error = -EBUSY;
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
@@ -4017,8 +4018,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 		d_move(old_dentry, new_dentry);
 out:
-	if (target)
-		mutex_unlock(&target->i_mutex);
+	unlock_two_nondirectories(source, target);
 	dput(new_dentry);
 	return error;
 }
-- 
cgit v1.3


From 617588d5186c887eb94321b021bb5a46f896f4b3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 1 Jul 2011 15:18:34 -0400
Subject: locks: introduce new FL_DELEG lock flag

For now FL_DELEG is just a synonym for FL_LEASE.  So this patch doesn't
change behavior.

Next we'll modify break_lease to treat FL_DELEG leases differently, to
account for the fact that NFSv4 delegations should be broken in more
situations than Windows oplocks.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/locks.c          | 2 +-
 fs/nfsd/nfs4state.c | 2 +-
 include/linux/fs.h  | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index f155ffd648bb..079abcd8a836 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -134,7 +134,7 @@
 
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl)	(fl->fl_flags & FL_LEASE)
+#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG))
 
 static bool lease_breaking(struct file_lock *fl)
 {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a601fd49f997..f36a30a9f2d1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3008,7 +3008,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
 		return NULL;
 	locks_init_lock(fl);
 	fl->fl_lmops = &nfsd_lease_mng_ops;
-	fl->fl_flags = FL_LEASE;
+	fl->fl_flags = FL_DELEG;
 	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_owner = (fl_owner_t)(dp->dl_file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed7f94af1ab2..129e150f9e94 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -880,6 +880,7 @@ static inline int file_check_writeable(struct file *filp)
 
 #define FL_POSIX	1
 #define FL_FLOCK	2
+#define FL_DELEG	4	/* NFSv4 delegation */
 #define FL_ACCESS	8	/* not trying to lock, just looking */
 #define FL_EXISTS	16	/* when unlocking, test for existence */
 #define FL_LEASE	32	/* lease held on this file */
-- 
cgit v1.3


From df4e8d2c1d2bbbbace706bfe5417320c9e3fbee3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 5 Mar 2012 13:18:59 -0500
Subject: locks: implement delegations

Implement NFSv4 delegations at the vfs level using the new FL_DELEG lock
type.

Note nfsd is the only delegation user and is only using read
delegations.  Warn on any attempt to set a write delegation for now.
We'll come back to that case later.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/locks.c         | 55 ++++++++++++++++++++++++++++++++++++++++++++----------
 include/linux/fs.h | 18 +++++++++++++++---
 2 files changed, 60 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 079abcd8a836..f99d52bdd05a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1292,28 +1292,40 @@ static void time_out_leases(struct inode *inode)
 	}
 }
 
+static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
+{
+	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
+		return false;
+	return locks_conflict(breaker, lease);
+}
+
 /**
  *	__break_lease	-	revoke all outstanding leases on file
  *	@inode: the inode of the file to return
- *	@mode: the open mode (read or write)
+ *	@mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
+ *	    break all leases
+ *	@type: FL_LEASE: break leases and delegations; FL_DELEG: break
+ *	    only delegations
  *
  *	break_lease (inlined for speed) has checked there already is at least
  *	some kind of lock (maybe a lease) on this file.  Leases are broken on
  *	a call to open() or truncate().  This function can sleep unless you
  *	specified %O_NONBLOCK to your open().
  */
-int __break_lease(struct inode *inode, unsigned int mode)
+int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	int error = 0;
 	struct file_lock *new_fl, *flock;
 	struct file_lock *fl;
 	unsigned long break_time;
 	int i_have_this_lease = 0;
+	bool lease_conflict = false;
 	int want_write = (mode & O_ACCMODE) != O_RDONLY;
 
 	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 	if (IS_ERR(new_fl))
 		return PTR_ERR(new_fl);
+	new_fl->fl_flags = type;
 
 	spin_lock(&inode->i_lock);
 
@@ -1323,13 +1335,16 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	if ((flock == NULL) || !IS_LEASE(flock))
 		goto out;
 
-	if (!locks_conflict(flock, new_fl))
+	for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
+		if (leases_conflict(fl, new_fl)) {
+			lease_conflict = true;
+			if (fl->fl_owner == current->files)
+				i_have_this_lease = 1;
+		}
+	}
+	if (!lease_conflict)
 		goto out;
 
-	for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next)
-		if (fl->fl_owner == current->files)
-			i_have_this_lease = 1;
-
 	break_time = 0;
 	if (lease_break_time > 0) {
 		break_time = jiffies + lease_break_time * HZ;
@@ -1338,6 +1353,8 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	}
 
 	for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
+		if (!leases_conflict(fl, new_fl))
+			continue;
 		if (want_write) {
 			if (fl->fl_flags & FL_UNLOCK_PENDING)
 				continue;
@@ -1379,7 +1396,7 @@ restart:
 		 */
 		for (flock = inode->i_flock; flock && IS_LEASE(flock);
 				flock = flock->fl_next) {
-			if (locks_conflict(new_fl, flock))
+			if (leases_conflict(new_fl, flock))
 				goto restart;
 		}
 		error = 0;
@@ -1460,9 +1477,26 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
+	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
 
 	lease = *flp;
+	/*
+	 * In the delegation case we need mutual exclusion with
+	 * a number of operations that take the i_mutex.  We trylock
+	 * because delegations are an optional optimization, and if
+	 * there's some chance of a conflict--we'd rather not
+	 * bother, maybe that's a sign this just isn't a good file to
+	 * hand out a delegation on.
+	 */
+	if (is_deleg && !mutex_trylock(&inode->i_mutex))
+		return -EAGAIN;
+
+	if (is_deleg && arg == F_WRLCK) {
+		/* Write delegations are not currently supported: */
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
 
 	error = -EAGAIN;
 	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
@@ -1514,9 +1548,10 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
 		goto out;
 
 	locks_insert_lock(before, lease);
-	return 0;
-
+	error = 0;
 out:
+	if (is_deleg)
+		mutex_unlock(&inode->i_mutex);
 	return error;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 129e150f9e94..8e4be1be1a62 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1022,7 +1022,7 @@ extern int vfs_test_lock(struct file *, struct file_lock *);
 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
-extern int __break_lease(struct inode *inode, unsigned int flags);
+extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
 extern int generic_setlease(struct file *, long, struct file_lock **);
 extern int vfs_setlease(struct file *, long, struct file_lock **);
@@ -1131,7 +1131,7 @@ static inline int flock_lock_file_wait(struct file *filp,
 	return -ENOLCK;
 }
 
-static inline int __break_lease(struct inode *inode, unsigned int mode)
+static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	return 0;
 }
@@ -1961,9 +1961,17 @@ static inline int locks_verify_truncate(struct inode *inode,
 static inline int break_lease(struct inode *inode, unsigned int mode)
 {
 	if (inode->i_flock)
-		return __break_lease(inode, mode);
+		return __break_lease(inode, mode, FL_LEASE);
 	return 0;
 }
+
+static inline int break_deleg(struct inode *inode, unsigned int mode)
+{
+	if (inode->i_flock)
+		return __break_lease(inode, mode, FL_DELEG);
+	return 0;
+}
+
 #else /* !CONFIG_FILE_LOCKING */
 static inline int locks_mandatory_locked(struct inode *inode)
 {
@@ -2003,6 +2011,10 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 	return 0;
 }
 
+static inline int break_deleg(struct inode *inode, unsigned int mode)
+{
+	return 0;
+}
 #endif /* CONFIG_FILE_LOCKING */
 
 /* fs/open.c */
-- 
cgit v1.3


From 9accbb977ab78234b8f298df5f306ed08d06bedb Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 28 Aug 2012 07:03:24 -0400
Subject: namei: minor vfs_unlink cleanup

We'll be using dentry->d_inode in one more place.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 88cec0330bf7..e633a58d4222 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3617,6 +3617,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 
 int vfs_unlink(struct inode *dir, struct dentry *dentry)
 {
+	struct inode *target = dentry->d_inode;
 	int error = may_delete(dir, dentry, 0);
 
 	if (error)
@@ -3625,7 +3626,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	mutex_lock(&dentry->d_inode->i_mutex);
+	mutex_lock(&target->i_mutex);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
 	else {
@@ -3636,11 +3637,11 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 				dont_mount(dentry);
 		}
 	}
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	mutex_unlock(&target->i_mutex);
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
-		fsnotify_link_count(dentry->d_inode);
+		fsnotify_link_count(target);
 		d_delete(dentry);
 	}
 
-- 
cgit v1.3


From b21996e36c8e3b92a84e972378bde80b43acd890 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 20 Sep 2011 09:14:34 -0400
Subject: locks: break delegations on unlink

We need to break delegations on any operation that changes the set of
links pointing to an inode.  Start with unlink.

Such operations also hold the i_mutex on a parent directory.  Breaking a
delegation may require waiting for a timeout (by default 90 seconds) in
the case of a unresponsive NFS client.  To avoid blocking all directory
operations, we therefore drop locks before waiting for the delegation.
The logic then looks like:

	acquire locks
	...
	test for delegation; if found:
		take reference on inode
		release locks
		wait for delegation break
		drop reference on inode
		retry

It is possible this could never terminate.  (Even if we take precautions
to prevent another delegation being acquired on the same inode, we could
get a different inode on each retry.)  But this seems very unlikely.

The initial test for a delegation happens after the lock on the target
inode is acquired, but the directory inode may have been acquired
further up the call stack.  We therefore add a "struct inode **"
argument to any intervening functions, which we use to pass the inode
back up to the caller in the case it needs a delegation synchronously
broken.

Cc: David Howells <dhowells@redhat.com>
Cc: Tyler Hicks <tyhicks@canonical.com>
Cc: Dustin Kirkland <dustin.kirkland@gazzang.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/base/devtmpfs.c |  2 +-
 fs/cachefiles/namei.c   |  2 +-
 fs/ecryptfs/inode.c     |  4 ++--
 fs/namei.c              | 42 +++++++++++++++++++++++++++++++++++++++---
 fs/nfsd/vfs.c           |  2 +-
 include/linux/fs.h      |  2 +-
 ipc/mqueue.c            |  2 +-
 7 files changed, 46 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 7413d065906b..1b8490e2fbde 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -324,7 +324,7 @@ static int handle_remove(const char *nodename, struct device *dev)
 			mutex_lock(&dentry->d_inode->i_mutex);
 			notify_change(dentry, &newattrs);
 			mutex_unlock(&dentry->d_inode->i_mutex);
-			err = vfs_unlink(parent.dentry->d_inode, dentry);
+			err = vfs_unlink(parent.dentry->d_inode, dentry, NULL);
 			if (!err || err == -ENOENT)
 				deleted = 1;
 		}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a08d7fa2f7..31d480c0e046 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -294,7 +294,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 		if (ret < 0) {
 			cachefiles_io_error(cache, "Unlink security error");
 		} else {
-			ret = vfs_unlink(dir->d_inode, rep);
+			ret = vfs_unlink(dir->d_inode, rep, NULL);
 
 			if (preemptive)
 				cachefiles_mark_object_buried(cache, rep);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0f9b66eaa767..dc60b8bd09ec 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -153,7 +153,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
 
 	dget(lower_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_unlink(lower_dir_inode, lower_dentry);
+	rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
 	if (rc) {
 		printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
 		goto out_unlock;
@@ -208,7 +208,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 	inode = __ecryptfs_get_inode(lower_dentry->d_inode,
 				     directory_inode->i_sb);
 	if (IS_ERR(inode)) {
-		vfs_unlink(lower_dir_dentry->d_inode, lower_dentry);
+		vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL);
 		goto out_lock;
 	}
 	fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
diff --git a/fs/namei.c b/fs/namei.c
index e633a58d4222..67ce331a3ed8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3615,7 +3615,25 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 	return do_rmdir(AT_FDCWD, pathname);
 }
 
-int vfs_unlink(struct inode *dir, struct dentry *dentry)
+/**
+ * vfs_unlink - unlink a filesystem object
+ * @dir:	parent directory
+ * @dentry:	victim
+ * @delegated_inode: returns victim inode, if the inode is delegated.
+ *
+ * The caller must hold dir->i_mutex.
+ *
+ * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
+ * return a reference to the inode in delegated_inode.  The caller
+ * should then break the delegation on that inode and retry.  Because
+ * breaking a delegation may take a long time, the caller should drop
+ * dir->i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ */
+int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
 {
 	struct inode *target = dentry->d_inode;
 	int error = may_delete(dir, dentry, 0);
@@ -3632,11 +3650,20 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	else {
 		error = security_inode_unlink(dir, dentry);
 		if (!error) {
+			error = break_deleg(target, O_WRONLY|O_NONBLOCK);
+			if (error) {
+				if (error == -EWOULDBLOCK && delegated_inode) {
+					*delegated_inode = target;
+					ihold(target);
+				}
+				goto out;
+			}
 			error = dir->i_op->unlink(dir, dentry);
 			if (!error)
 				dont_mount(dentry);
 		}
 	}
+out:
 	mutex_unlock(&target->i_mutex);
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
@@ -3661,6 +3688,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 	struct dentry *dentry;
 	struct nameidata nd;
 	struct inode *inode = NULL;
+	struct inode *delegated_inode = NULL;
 	unsigned int lookup_flags = 0;
 retry:
 	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
@@ -3675,7 +3703,7 @@ retry:
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto exit1;
-
+retry_deleg:
 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
@@ -3690,13 +3718,21 @@ retry:
 		error = security_path_unlink(&nd.path, dentry);
 		if (error)
 			goto exit2;
-		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+		error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
 exit2:
 		dput(dentry);
 	}
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 	if (inode)
 		iput(inode);	/* truncate the inode here */
+	inode = NULL;
+	if (delegated_inode) {
+		error = break_deleg(delegated_inode, O_WRONLY);
+		iput(delegated_inode);
+		delegated_inode = NULL;
+		if (!error)
+			goto retry_deleg;
+	}
 	mnt_drop_write(nd.path.mnt);
 exit1:
 	path_put(&nd.path);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 13886f7f40d5..7a810235d599 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1910,7 +1910,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (host_err)
 		goto out_put;
 	if (type != S_IFDIR)
-		host_err = vfs_unlink(dirp, rdentry);
+		host_err = vfs_unlink(dirp, rdentry, NULL);
 	else
 		host_err = vfs_rmdir(dirp, rdentry);
 	if (!host_err)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8e4be1be1a62..a5799233142a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1455,7 +1455,7 @@ extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 extern int vfs_symlink(struct inode *, struct dentry *, const char *);
 extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
 extern int vfs_rmdir(struct inode *, struct dentry *);
-extern int vfs_unlink(struct inode *, struct dentry *);
+extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
 
 /*
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ae1996d3c539..95827ce2f3c7 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -886,7 +886,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 		err = -ENOENT;
 	} else {
 		ihold(inode);
-		err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+		err = vfs_unlink(dentry->d_parent->d_inode, dentry, NULL);
 	}
 	dput(dentry);
 
-- 
cgit v1.3


From 5a14696c1795d3843673b5cf1982d0e5357a5bbf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 28 Aug 2012 07:50:40 -0700
Subject: locks: helper functions for delegation breaking

We'll need the same logic for rename and link.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c         | 13 +++----------
 include/linux/fs.h | 39 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 67ce331a3ed8..cfaeaae0f2db 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3650,14 +3650,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
 	else {
 		error = security_inode_unlink(dir, dentry);
 		if (!error) {
-			error = break_deleg(target, O_WRONLY|O_NONBLOCK);
-			if (error) {
-				if (error == -EWOULDBLOCK && delegated_inode) {
-					*delegated_inode = target;
-					ihold(target);
-				}
+			error = try_break_deleg(target, delegated_inode);
+			if (error)
 				goto out;
-			}
 			error = dir->i_op->unlink(dir, dentry);
 			if (!error)
 				dont_mount(dentry);
@@ -3727,9 +3722,7 @@ exit2:
 		iput(inode);	/* truncate the inode here */
 	inode = NULL;
 	if (delegated_inode) {
-		error = break_deleg(delegated_inode, O_WRONLY);
-		iput(delegated_inode);
-		delegated_inode = NULL;
+		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a5799233142a..931f919f44e1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1905,6 +1905,9 @@ extern bool fs_fully_visible(struct file_system_type *);
 
 extern int current_umask(void);
 
+extern void ihold(struct inode * inode);
+extern void iput(struct inode *);
+
 /* /sys/fs */
 extern struct kobject *fs_kobj;
 
@@ -1972,6 +1975,28 @@ static inline int break_deleg(struct inode *inode, unsigned int mode)
 	return 0;
 }
 
+static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+{
+	int ret;
+
+	ret = break_deleg(inode, O_WRONLY|O_NONBLOCK);
+	if (ret == -EWOULDBLOCK && delegated_inode) {
+		*delegated_inode = inode;
+		ihold(inode);
+	}
+	return ret;
+}
+
+static inline int break_deleg_wait(struct inode **delegated_inode)
+{
+	int ret;
+
+	ret = break_deleg(*delegated_inode, O_WRONLY);
+	iput(*delegated_inode);
+	*delegated_inode = NULL;
+	return ret;
+}
+
 #else /* !CONFIG_FILE_LOCKING */
 static inline int locks_mandatory_locked(struct inode *inode)
 {
@@ -2015,6 +2040,18 @@ static inline int break_deleg(struct inode *inode, unsigned int mode)
 {
 	return 0;
 }
+
+static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+{
+	return 0;
+}
+
+static inline int break_deleg_wait(struct inode **delegated_inode)
+{
+	BUG();
+	return 0;
+}
+
 #endif /* CONFIG_FILE_LOCKING */
 
 /* fs/open.c */
@@ -2350,8 +2387,6 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
 extern void address_space_init_once(struct address_space *mapping);
-extern void ihold(struct inode * inode);
-extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
-- 
cgit v1.3


From 8e6d782cab50884ba94324632700e6233a252f6a Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 20 Sep 2011 16:59:58 -0400
Subject: locks: break delegations on rename

Cc: David Howells <dhowells@redhat.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 .../lustre/lustre/include/linux/lustre_compat25.h  |  4 +-
 drivers/staging/lustre/lustre/lvfs/lvfs_linux.c    |  2 +-
 fs/cachefiles/namei.c                              |  2 +-
 fs/ecryptfs/inode.c                                |  3 +-
 fs/namei.c                                         | 47 ++++++++++++++++++++--
 fs/nfsd/vfs.c                                      |  2 +-
 include/linux/fs.h                                 |  2 +-
 7 files changed, 51 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
index 9243dfab43d3..80b019bf969c 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
@@ -105,8 +105,8 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
 #define ll_vfs_unlink(inode,entry,mnt)	  vfs_unlink(inode,entry)
 #define ll_vfs_mknod(dir,entry,mnt,mode,dev)    vfs_mknod(dir,entry,mode,dev)
 #define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
-#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
-		vfs_rename(old,old_dir,new,new_dir)
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1,delegated_inode) \
+		vfs_rename(old,old_dir,new,new_dir,delegated_inode)
 
 #define cfs_bio_io_error(a,b)   bio_io_error((a))
 #define cfs_bio_endio(a,b,c)    bio_endio((a),(c))
diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
index 18e1b47a1d65..4ed7c9f0a8be 100644
--- a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
+++ b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
@@ -220,7 +220,7 @@ int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
 		GOTO(put_old, err = PTR_ERR(dchild_new));
 
 	err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
-			    dir->d_inode, dchild_new, mnt);
+			    dir->d_inode, dchild_new, mnt, NULL);
 
 	dput(dchild_new);
 put_old:
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 31d480c0e046..ca65f39dc8dc 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -396,7 +396,7 @@ try_again:
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
 		ret = vfs_rename(dir->d_inode, rep,
-				 cache->graveyard->d_inode, grave);
+				 cache->graveyard->d_inode, grave, NULL);
 		if (ret != 0 && ret != -ENOMEM)
 			cachefiles_io_error(cache,
 					    "Rename failed with error %d", ret);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index dc60b8bd09ec..c23b01bb7e04 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -640,7 +640,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out_lock;
 	}
 	rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
-			lower_new_dir_dentry->d_inode, lower_new_dentry);
+			lower_new_dir_dentry->d_inode, lower_new_dentry,
+			NULL);
 	if (rc)
 		goto out_lock;
 	if (target_inode)
diff --git a/fs/namei.c b/fs/namei.c
index cfaeaae0f2db..ce7e580e4e14 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4022,7 +4022,8 @@ out:
 }
 
 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
-			    struct inode *new_dir, struct dentry *new_dentry)
+			    struct inode *new_dir, struct dentry *new_dentry,
+			    struct inode **delegated_inode)
 {
 	struct inode *target = new_dentry->d_inode;
 	struct inode *source = old_dentry->d_inode;
@@ -4039,6 +4040,14 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
 		goto out;
 
+	error = try_break_deleg(source, delegated_inode);
+	if (error)
+		goto out;
+	if (target) {
+		error = try_break_deleg(target, delegated_inode);
+		if (error)
+			goto out;
+	}
 	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (error)
 		goto out;
@@ -4053,8 +4062,30 @@ out:
 	return error;
 }
 
+/**
+ * vfs_rename - rename a filesystem object
+ * @old_dir:	parent of source
+ * @old_dentry:	source
+ * @new_dir:	parent of destination
+ * @new_dentry:	destination
+ * @delegated_inode: returns an inode needing a delegation break
+ *
+ * The caller must hold multiple mutexes--see lock_rename()).
+ *
+ * If vfs_rename discovers a delegation in need of breaking at either
+ * the source or destination, it will return -EWOULDBLOCK and return a
+ * reference to the inode in delegated_inode.  The caller should then
+ * break the delegation and retry.  Because breaking a delegation may
+ * take a long time, the caller should drop all locks before doing
+ * so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ */
 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-	       struct inode *new_dir, struct dentry *new_dentry)
+	       struct inode *new_dir, struct dentry *new_dentry,
+	       struct inode **delegated_inode)
 {
 	int error;
 	int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
@@ -4082,7 +4113,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (is_dir)
 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 	else
-		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
+		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode);
 	if (!error)
 		fsnotify_move(old_dir, new_dir, old_name, is_dir,
 			      new_dentry->d_inode, old_dentry);
@@ -4098,6 +4129,7 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 	struct dentry *old_dentry, *new_dentry;
 	struct dentry *trap;
 	struct nameidata oldnd, newnd;
+	struct inode *delegated_inode = NULL;
 	struct filename *from;
 	struct filename *to;
 	unsigned int lookup_flags = 0;
@@ -4137,6 +4169,7 @@ retry:
 	newnd.flags &= ~LOOKUP_PARENT;
 	newnd.flags |= LOOKUP_RENAME_TARGET;
 
+retry_deleg:
 	trap = lock_rename(new_dir, old_dir);
 
 	old_dentry = lookup_hash(&oldnd);
@@ -4173,13 +4206,19 @@ retry:
 	if (error)
 		goto exit5;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
-				   new_dir->d_inode, new_dentry);
+				   new_dir->d_inode, new_dentry,
+				   &delegated_inode);
 exit5:
 	dput(new_dentry);
 exit4:
 	dput(old_dentry);
 exit3:
 	unlock_rename(new_dir, old_dir);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
 	mnt_drop_write(oldnd.path.mnt);
 exit2:
 	if (retry_estale(error, lookup_flags))
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7a810235d599..45bf0295894d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1837,7 +1837,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 		if (host_err)
 			goto out_dput_new;
 	}
-	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
+	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
 	if (!host_err) {
 		host_err = commit_metadata(tfhp);
 		if (!host_err)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 931f919f44e1..5bcff883fa90 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1456,7 +1456,7 @@ extern int vfs_symlink(struct inode *, struct dentry *, const char *);
 extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
 extern int vfs_rmdir(struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
+extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **);
 
 /*
  * VFS dentry helper functions.
-- 
cgit v1.3


From 146a8595c6399ee6ab4b5cc34c0d28aa4835fdc5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 20 Sep 2011 17:14:31 -0400
Subject: locks: break delegations on link

Cc: Tyler Hicks <tyhicks@canonical.com>
Cc: Dustin Kirkland <dustin.kirkland@gazzang.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/inode.c |  2 +-
 fs/namei.c          | 36 ++++++++++++++++++++++++++++++++----
 fs/nfsd/vfs.c       |  2 +-
 include/linux/fs.h  |  2 +-
 4 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c23b01bb7e04..1c628f023041 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -475,7 +475,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	dget(lower_new_dentry);
 	lower_dir_dentry = lock_parent(lower_new_dentry);
 	rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
-		      lower_new_dentry);
+		      lower_new_dentry, NULL);
 	if (rc || !lower_new_dentry->d_inode)
 		goto out_lock;
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
diff --git a/fs/namei.c b/fs/namei.c
index ce7e580e4e14..251178a1e383 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3819,7 +3819,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
 	return sys_symlinkat(oldname, AT_FDCWD, newname);
 }
 
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
+/**
+ * vfs_link - create a new link
+ * @old_dentry:	object to be linked
+ * @dir:	new parent
+ * @new_dentry:	where to create the new link
+ * @delegated_inode: returns inode needing a delegation break
+ *
+ * The caller must hold dir->i_mutex
+ *
+ * If vfs_link discovers a delegation on the to-be-linked file in need
+ * of breaking, it will return -EWOULDBLOCK and return a reference to the
+ * inode in delegated_inode.  The caller should then break the delegation
+ * and retry.  Because breaking a delegation may take a long time, the
+ * caller should drop the i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ */
+int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
 {
 	struct inode *inode = old_dentry->d_inode;
 	unsigned max_links = dir->i_sb->s_max_links;
@@ -3855,8 +3874,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		error =  -ENOENT;
 	else if (max_links && inode->i_nlink >= max_links)
 		error = -EMLINK;
-	else
-		error = dir->i_op->link(old_dentry, dir, new_dentry);
+	else {
+		error = try_break_deleg(inode, delegated_inode);
+		if (!error)
+			error = dir->i_op->link(old_dentry, dir, new_dentry);
+	}
 
 	if (!error && (inode->i_state & I_LINKABLE)) {
 		spin_lock(&inode->i_lock);
@@ -3883,6 +3905,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 {
 	struct dentry *new_dentry;
 	struct path old_path, new_path;
+	struct inode *delegated_inode = NULL;
 	int how = 0;
 	int error;
 
@@ -3921,9 +3944,14 @@ retry:
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
+	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
 out_dput:
 	done_path_create(&new_path, new_dentry);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry;
+	}
 	if (retry_estale(error, how)) {
 		how |= LOOKUP_REVAL;
 		goto retry;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45bf0295894d..27ba21b5f383 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1736,7 +1736,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		err = nfserrno(host_err);
 		goto out_dput;
 	}
-	host_err = vfs_link(dold, dirp, dnew);
+	host_err = vfs_link(dold, dirp, dnew, NULL);
 	if (!host_err) {
 		err = nfserrno(commit_metadata(ffhp));
 		if (!err)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5bcff883fa90..6e36e7118ec1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1453,7 +1453,7 @@ extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
 extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
 extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 extern int vfs_symlink(struct inode *, struct dentry *, const char *);
-extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
+extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
 extern int vfs_rmdir(struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **);
-- 
cgit v1.3


From 27ac0ffeac80ba6b9580529568d06144df044366 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 20 Sep 2011 17:19:26 -0400
Subject: locks: break delegations on any attribute modification

NFSv4 uses leases to guarantee that clients can cache metadata as well
as data.

Cc: Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>
Cc: David Howells <dhowells@redhat.com>
Cc: Tyler Hicks <tyhicks@canonical.com>
Cc: Dustin Kirkland <dustin.kirkland@gazzang.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/base/devtmpfs.c   |  4 ++--
 fs/attr.c                 | 25 ++++++++++++++++++++++++-
 fs/cachefiles/interface.c |  4 ++--
 fs/ecryptfs/inode.c       |  4 ++--
 fs/hpfs/namei.c           |  2 +-
 fs/inode.c                |  6 +++++-
 fs/nfsd/vfs.c             |  8 ++++++--
 fs/open.c                 | 22 ++++++++++++++++++----
 fs/utimes.c               |  9 ++++++++-
 include/linux/fs.h        |  2 +-
 10 files changed, 69 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 1b8490e2fbde..0f3820121e02 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -216,7 +216,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 		newattrs.ia_gid = gid;
 		newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID;
 		mutex_lock(&dentry->d_inode->i_mutex);
-		notify_change(dentry, &newattrs);
+		notify_change(dentry, &newattrs, NULL);
 		mutex_unlock(&dentry->d_inode->i_mutex);
 
 		/* mark as kernel-created inode */
@@ -322,7 +322,7 @@ static int handle_remove(const char *nodename, struct device *dev)
 			newattrs.ia_valid =
 				ATTR_UID|ATTR_GID|ATTR_MODE;
 			mutex_lock(&dentry->d_inode->i_mutex);
-			notify_change(dentry, &newattrs);
+			notify_change(dentry, &newattrs, NULL);
 			mutex_unlock(&dentry->d_inode->i_mutex);
 			err = vfs_unlink(parent.dentry->d_inode, dentry, NULL);
 			if (!err || err == -ENOENT)
diff --git a/fs/attr.c b/fs/attr.c
index 1449adb14ef6..267968d94673 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -167,7 +167,27 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 }
 EXPORT_SYMBOL(setattr_copy);
 
-int notify_change(struct dentry * dentry, struct iattr * attr)
+/**
+ * notify_change - modify attributes of a filesytem object
+ * @dentry:	object affected
+ * @iattr:	new attributes
+ * @delegated_inode: returns inode, if the inode is delegated
+ *
+ * The caller must hold the i_mutex on the affected object.
+ *
+ * If notify_change discovers a delegation in need of breaking,
+ * it will return -EWOULDBLOCK and return a reference to the inode in
+ * delegated_inode.  The caller should then break the delegation and
+ * retry.  Because breaking a delegation may take a long time, the
+ * caller should drop the i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.  Also, passing NULL is fine for callers holding
+ * the file open for write, as there can be no conflicting delegation in
+ * that case.
+ */
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	umode_t mode = inode->i_mode;
@@ -241,6 +261,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 		return 0;
 
 	error = security_inode_setattr(dentry, attr);
+	if (error)
+		return error;
+	error = try_break_deleg(inode, delegated_inode);
 	if (error)
 		return error;
 
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 43eb5592cdea..5088a418ac4d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -449,14 +449,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 		_debug("discard tail %llx", oi_size);
 		newattrs.ia_valid = ATTR_SIZE;
 		newattrs.ia_size = oi_size & PAGE_MASK;
-		ret = notify_change(object->backer, &newattrs);
+		ret = notify_change(object->backer, &newattrs, NULL);
 		if (ret < 0)
 			goto truncate_failed;
 	}
 
 	newattrs.ia_valid = ATTR_SIZE;
 	newattrs.ia_size = ni_size;
-	ret = notify_change(object->backer, &newattrs);
+	ret = notify_change(object->backer, &newattrs, NULL);
 
 truncate_failed:
 	mutex_unlock(&object->backer->d_inode->i_mutex);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1c628f023041..c36c44824471 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -882,7 +882,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
 		mutex_lock(&lower_dentry->d_inode->i_mutex);
-		rc = notify_change(lower_dentry, &lower_ia);
+		rc = notify_change(lower_dentry, &lower_ia, NULL);
 		mutex_unlock(&lower_dentry->d_inode->i_mutex);
 	}
 	return rc;
@@ -983,7 +983,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 		lower_ia.ia_valid &= ~ATTR_MODE;
 
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
-	rc = notify_change(lower_dentry, &lower_ia);
+	rc = notify_change(lower_dentry, &lower_ia, NULL);
 	mutex_unlock(&lower_dentry->d_inode->i_mutex);
 out:
 	fsstack_copy_attr_all(inode, lower_inode);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 345713d2f8f3..1b39afdd86fd 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -407,7 +407,7 @@ again:
 			/*printk("HPFS: truncating file before delete.\n");*/
 			newattrs.ia_size = 0;
 			newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-			err = notify_change(dentry, &newattrs);
+			err = notify_change(dentry, &newattrs, NULL);
 			put_write_access(inode);
 			if (!err)
 				goto again;
diff --git a/fs/inode.c b/fs/inode.c
index ce48c359ce9e..4bcdad3c9361 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1603,7 +1603,11 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	struct iattr newattrs;
 
 	newattrs.ia_valid = ATTR_FORCE | kill;
-	return notify_change(dentry, &newattrs);
+	/*
+	 * Note we call this on write, so notify_change will not
+	 * encounter any conflicting delegations:
+	 */
+	return notify_change(dentry, &newattrs, NULL);
 }
 
 int file_remove_suid(struct file *file)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 27ba21b5f383..94b5f5d2bfed 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -427,7 +427,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			goto out_nfserr;
 		fh_lock(fhp);
 
-		host_err = notify_change(dentry, iap);
+		host_err = notify_change(dentry, iap, NULL);
 		err = nfserrno(host_err);
 		fh_unlock(fhp);
 	}
@@ -988,7 +988,11 @@ static void kill_suid(struct dentry *dentry)
 	ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	notify_change(dentry, &ia);
+	/*
+	 * Note we call this on write, so notify_change will not
+	 * encounter any conflicting delegations:
+	 */
+	notify_change(dentry, &ia, NULL);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 }
 
diff --git a/fs/open.c b/fs/open.c
index fffbed40dbe9..4b3e1edf2fe4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -57,7 +57,8 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		newattrs.ia_valid |= ret | ATTR_FORCE;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	ret = notify_change(dentry, &newattrs);
+	/* Note any delegations or leases have already been broken: */
+	ret = notify_change(dentry, &newattrs, NULL);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 	return ret;
 }
@@ -464,21 +465,28 @@ out:
 static int chmod_common(struct path *path, umode_t mode)
 {
 	struct inode *inode = path->dentry->d_inode;
+	struct inode *delegated_inode = NULL;
 	struct iattr newattrs;
 	int error;
 
 	error = mnt_want_write(path->mnt);
 	if (error)
 		return error;
+retry_deleg:
 	mutex_lock(&inode->i_mutex);
 	error = security_path_chmod(path, mode);
 	if (error)
 		goto out_unlock;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-	error = notify_change(path->dentry, &newattrs);
+	error = notify_change(path->dentry, &newattrs, &delegated_inode);
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
 	mnt_drop_write(path->mnt);
 	return error;
 }
@@ -522,6 +530,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
 static int chown_common(struct path *path, uid_t user, gid_t group)
 {
 	struct inode *inode = path->dentry->d_inode;
+	struct inode *delegated_inode = NULL;
 	int error;
 	struct iattr newattrs;
 	kuid_t uid;
@@ -546,12 +555,17 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
 	if (!S_ISDIR(inode->i_mode))
 		newattrs.ia_valid |=
 			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+retry_deleg:
 	mutex_lock(&inode->i_mutex);
 	error = security_path_chown(path, uid, gid);
 	if (!error)
-		error = notify_change(path->dentry, &newattrs);
+		error = notify_change(path->dentry, &newattrs, &delegated_inode);
 	mutex_unlock(&inode->i_mutex);
-
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
 	return error;
 }
 
diff --git a/fs/utimes.c b/fs/utimes.c
index f4fb7eca10e8..aa138d64560a 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -53,6 +53,7 @@ static int utimes_common(struct path *path, struct timespec *times)
 	int error;
 	struct iattr newattrs;
 	struct inode *inode = path->dentry->d_inode;
+	struct inode *delegated_inode = NULL;
 
 	error = mnt_want_write(path->mnt);
 	if (error)
@@ -101,9 +102,15 @@ static int utimes_common(struct path *path, struct timespec *times)
 				goto mnt_drop_write_and_out;
 		}
 	}
+retry_deleg:
 	mutex_lock(&inode->i_mutex);
-	error = notify_change(path->dentry, &newattrs);
+	error = notify_change(path->dentry, &newattrs, &delegated_inode);
 	mutex_unlock(&inode->i_mutex);
+	if (delegated_inode) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry_deleg;
+	}
 
 mnt_drop_write_and_out:
 	mnt_drop_write(path->mnt);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6e36e7118ec1..ab2a0ca82dc5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2278,7 +2278,7 @@ extern void emergency_remount(void);
 #ifdef CONFIG_BLOCK
 extern sector_t bmap(struct inode *, sector_t);
 #endif
-extern int notify_change(struct dentry *, struct iattr *);
+extern int notify_change(struct dentry *, struct iattr *, struct inode **);
 extern int inode_permission(struct inode *, int);
 extern int generic_permission(struct inode *, int);
 
-- 
cgit v1.3


From bdd3536618443809d18868563eeafa63b9d29603 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 9 Nov 2013 22:19:48 -0500
Subject: ecryptfs: ->f_op is never NULL

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 6b1cd2b0c751..2229a74aeeed 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -318,7 +318,7 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	if (ecryptfs_file_to_private(file))
 		lower_file = ecryptfs_file_to_lower(file);
-	if (lower_file->f_op && lower_file->f_op->unlocked_ioctl)
+	if (lower_file->f_op->unlocked_ioctl)
 		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
 	return rc;
 }
-- 
cgit v1.3


From 29e59c14ae5c21d25db1580d9651b5855d656a30 Mon Sep 17 00:00:00 2001
From: Changman Lee <cm224.lee@samsung.com>
Date: Mon, 11 Nov 2013 09:24:37 +0900
Subject: f2fs: issue more large discard command

o Changes from v1
  Use find_next(_zero)_bit suggested by jg.kim

When f2fs issues discard command, if segment is contiguous,
let's issue more large segment to gather adjacent segments.

** blktrace **
179,1    0     5859    42.619023770   971  C   D 131072 + 2097152 [0]
179,1    0    33665   108.840475468   971  C   D 2228224 + 2494464 [0]
179,1    0    33671   109.131616427   971  C   D 14909440 + 344064 [0]
179,1    0    33677   109.137100677   971  C   D 15261696 + 4096 [0]

Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
---
 fs/f2fs/segment.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 86dc28949192..fa284d397199 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -139,27 +139,33 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 void clear_prefree_segments(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int segno = -1;
+	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
 	unsigned int total_segs = TOTAL_SEGS(sbi);
+	unsigned int start = 0, end = -1;
 
 	mutex_lock(&dirty_i->seglist_lock);
+
 	while (1) {
-		segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
-				segno + 1);
-		if (segno >= total_segs)
+		int i;
+		start = find_next_bit(prefree_map, total_segs, end + 1);
+		if (start >= total_segs)
 			break;
+		end = find_next_zero_bit(prefree_map, total_segs, start + 1);
+
+		for (i = start; i < end; i++)
+			clear_bit(i, prefree_map);
+
+		dirty_i->nr_dirty[PRE] -= end - start;
+
+		if (!test_opt(sbi, DISCARD))
+			continue;
 
-		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
-			dirty_i->nr_dirty[PRE]--;
-
-		/* Let's use trim */
-		if (test_opt(sbi, DISCARD))
-			blkdev_issue_discard(sbi->sb->s_bdev,
-					START_BLOCK(sbi, segno) <<
-					sbi->log_sectors_per_block,
-					1 << (sbi->log_sectors_per_block +
-						sbi->log_blocks_per_seg),
-					GFP_NOFS, 0);
+		blkdev_issue_discard(sbi->sb->s_bdev,
+				START_BLOCK(sbi, start) <<
+				sbi->log_sectors_per_block,
+				(1 << (sbi->log_sectors_per_block +
+				sbi->log_blocks_per_seg)) * (end - start),
+				GFP_NOFS, 0);
 	}
 	mutex_unlock(&dirty_i->seglist_lock);
 }
-- 
cgit v1.3


From eb85d94bdd91fb4dbea4ee465d4349cbea4eaaca Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastry@etersoft.ru>
Date: Wed, 23 Oct 2013 17:49:47 +0400
Subject: CIFS: Fix symbolic links usage

Now we treat any reparse point as a symbolic link and map it to a Unix
one that is not true in a common case due to many reparse point types
supported by SMB servers.

Distinguish reparse point types into two groups:
1) that can be accessed directly through a reparse point
(junctions, deduplicated files, NFS symlinks);
2) that need to be processed manually (Windows symbolic links, DFS);

and map only Windows symbolic links to Unix ones.

Cc: <stable@vger.kernel.org>
Acked-by: Jeff Layton <jlayton@redhat.com>
Reported-and-tested-by: Joao Correia <joaomiguelcorreia@gmail.com>
Signed-off-by: Pavel Shilovsky <piastry@etersoft.ru>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsglob.h  |  2 +-
 fs/cifs/inode.c     | 23 +++++++++++++----------
 fs/cifs/readdir.c   | 40 ++++++++--------------------------------
 fs/cifs/smb1ops.c   | 21 ++++++++++++++++++++-
 fs/cifs/smb2inode.c | 16 ++++++++++++----
 fs/cifs/smb2proto.h |  2 +-
 6 files changed, 55 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 26b1c1dc93f6..cddb807addde 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -261,7 +261,7 @@ struct smb_version_operations {
 	/* query path data from the server */
 	int (*query_path_info)(const unsigned int, struct cifs_tcon *,
 			       struct cifs_sb_info *, const char *,
-			       FILE_ALL_INFO *, bool *);
+			       FILE_ALL_INFO *, bool *, bool *);
 	/* query file data from the server */
 	int (*query_file_info)(const unsigned int, struct cifs_tcon *,
 			       struct cifs_fid *, FILE_ALL_INFO *);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 867b7cdc794a..36f9ebb93ceb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -542,7 +542,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
 static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
-		       struct cifs_sb_info *cifs_sb, bool adjust_tz)
+		       struct cifs_sb_info *cifs_sb, bool adjust_tz,
+		       bool symlink)
 {
 	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 
@@ -569,7 +570,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
 	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
-	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+
+	if (symlink) {
+		fattr->cf_mode = S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+	} else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
 		/*
@@ -578,10 +583,6 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		 */
 		if (!tcon->unix_ext)
 			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
-	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
-		fattr->cf_mode = S_IFLNK;
-		fattr->cf_dtype = DT_LNK;
-		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
@@ -626,7 +627,8 @@ cifs_get_file_info(struct file *filp)
 	rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
 	switch (rc) {
 	case 0:
-		cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+		cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false,
+				       false);
 		break;
 	case -EREMOTE:
 		cifs_create_dfs_fattr(&fattr, inode->i_sb);
@@ -673,6 +675,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	bool adjust_tz = false;
 	struct cifs_fattr fattr;
 	struct cifs_search_info *srchinf = NULL;
+	bool symlink = false;
 
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink))
@@ -702,12 +705,12 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 		}
 		data = (FILE_ALL_INFO *)buf;
 		rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path,
-						  data, &adjust_tz);
+						  data, &adjust_tz, &symlink);
 	}
 
 	if (!rc) {
-		cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *)data, cifs_sb,
-				       adjust_tz);
+		cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz,
+				       symlink);
 	} else if (rc == -EREMOTE) {
 		cifs_create_dfs_fattr(&fattr, sb);
 		rc = 0;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 53a75f3d0179..5940ecabbe6a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -134,22 +134,6 @@ out:
 	dput(dentry);
 }
 
-/*
- * Is it possible that this directory might turn out to be a DFS referral
- * once we go to try and use it?
- */
-static bool
-cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
-{
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
-
-	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
-		return true;
-#endif
-	return false;
-}
-
 static void
 cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 {
@@ -159,27 +143,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
-		/*
-		 * Windows CIFS servers generally make DFS referrals look
-		 * like directories in FIND_* responses with the reparse
-		 * attribute flag also set (since DFS junctions are
-		 * reparse points). We must revalidate at least these
-		 * directory inodes before trying to use them (if
-		 * they are DFS we will get PATH_NOT_COVERED back
-		 * when queried directly and can then try to connect
-		 * to the DFS target)
-		 */
-		if (cifs_dfs_is_possible(cifs_sb) &&
-		    (fattr->cf_cifsattrs & ATTR_REPARSE))
-			fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
-	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
-		fattr->cf_mode = S_IFLNK;
-		fattr->cf_dtype = DT_LNK;
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
 	}
 
+	/*
+	 * We need to revalidate it further to make a decision about whether it
+	 * is a symbolic link, DFS referral or a reparse point with a direct
+	 * access like junctions, deduplicated files, NFS symlinks.
+	 */
+	if (fattr->cf_cifsattrs & ATTR_REPARSE)
+		fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
 	/* non-unix readdir doesn't provide nlink */
 	fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
 
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 384cffe42850..5f5ba0dc2ee1 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -534,10 +534,12 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
 static int
 cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 		     struct cifs_sb_info *cifs_sb, const char *full_path,
-		     FILE_ALL_INFO *data, bool *adjustTZ)
+		     FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink)
 {
 	int rc;
 
+	*symlink = false;
+
 	/* could do find first instead but this returns more info */
 	rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
 			      cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -554,6 +556,23 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 		*adjustTZ = true;
 	}
+
+	if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
+		int tmprc;
+		int oplock = 0;
+		__u16 netfid;
+
+		/* Need to check if this is a symbolic link or not */
+		tmprc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
+				    FILE_READ_ATTRIBUTES, 0, &netfid, &oplock,
+				    NULL, cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (tmprc == -EOPNOTSUPP)
+			*symlink = true;
+		else
+			CIFSSMBClose(xid, tcon, netfid);
+	}
+
 	return rc;
 }
 
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 78ff88c467b9..84c012a6aba0 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -123,12 +123,13 @@ move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
 int
 smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 		     struct cifs_sb_info *cifs_sb, const char *full_path,
-		     FILE_ALL_INFO *data, bool *adjust_tz)
+		     FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink)
 {
 	int rc;
 	struct smb2_file_all_info *smb2_data;
 
 	*adjust_tz = false;
+	*symlink = false;
 
 	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
 			    GFP_KERNEL);
@@ -136,9 +137,16 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 		return -ENOMEM;
 
 	rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				FILE_READ_ATTRIBUTES, FILE_OPEN,
-				OPEN_REPARSE_POINT, smb2_data,
-				SMB2_OP_QUERY_INFO);
+				FILE_READ_ATTRIBUTES, FILE_OPEN, 0,
+				smb2_data, SMB2_OP_QUERY_INFO);
+	if (rc == -EOPNOTSUPP) {
+		*symlink = true;
+		/* Failed on a symbolic link - query a reparse point info */
+		rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+					FILE_READ_ATTRIBUTES, FILE_OPEN,
+					OPEN_REPARSE_POINT, smb2_data,
+					SMB2_OP_QUERY_INFO);
+	}
 	if (rc)
 		goto out;
 
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 313813e4c19b..b4eea105b08c 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -61,7 +61,7 @@ extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
 extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 				struct cifs_sb_info *cifs_sb,
 				const char *full_path, FILE_ALL_INFO *data,
-				bool *adjust_tz);
+				bool *adjust_tz, bool *symlink);
 extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
 			      const char *full_path, __u64 size,
 			      struct cifs_sb_info *cifs_sb, bool set_alloc);
-- 
cgit v1.3


From cce0244ab0af78f8f01ad826d2f9813a9580b105 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 7 Nov 2013 15:02:35 -0500
Subject: cifs: change ERRnomem error mapping from ENOMEM to EREMOTEIO

Sometimes, the server will report an error that basically indicates
that it's running out of resources. These include these under SMB1:

NT_STATUS_NO_MEMORY
NT_STATUS_SECTION_TOO_BIG
NT_STATUS_TOO_MANY_PAGING_FILES

...and this one under SMB2:

STATUS_NO_MEMORY

Currently, this gets mapped to ENOMEM by the client, but that's
confusing as an ENOMEM error is typically an indicator that the
client is out of memory.

Change these errors to instead map to EREMOTEIO to indicate that
the problem is actually server-side and not on the client.

Reported-by: "ISHIKAWA,chiaki" <ishikawa@yk.rim.or.jp>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/netmisc.c      | 2 +-
 fs/cifs/smb2maperror.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 651a5279607b..049884552e76 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -51,7 +51,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
 	{ERRnoaccess, -EACCES},
 	{ERRbadfid, -EBADF},
 	{ERRbadmcb, -EIO},
-	{ERRnomem, -ENOMEM},
+	{ERRnomem, -EREMOTEIO},
 	{ERRbadmem, -EFAULT},
 	{ERRbadenv, -EFAULT},
 	{ERRbadformat, -EINVAL},
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 7c2f45c06fc2..94bd4fbb13d3 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -306,7 +306,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
 	{STATUS_NONEXISTENT_SECTOR, -EIO, "STATUS_NONEXISTENT_SECTOR"},
 	{STATUS_MORE_PROCESSING_REQUIRED, -EIO,
 	"STATUS_MORE_PROCESSING_REQUIRED"},
-	{STATUS_NO_MEMORY, -ENOMEM, "STATUS_NO_MEMORY"},
+	{STATUS_NO_MEMORY, -EREMOTEIO, "STATUS_NO_MEMORY"},
 	{STATUS_CONFLICTING_ADDRESSES, -EADDRINUSE,
 	"STATUS_CONFLICTING_ADDRESSES"},
 	{STATUS_NOT_MAPPED_VIEW, -EIO, "STATUS_NOT_MAPPED_VIEW"},
-- 
cgit v1.3


From a8582159edb47a60dd1e5df5a21890f225270f62 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 7 Nov 2013 15:03:39 -0500
Subject: cifs: don't spam the logs on unexpected lookup errors

Andrey reported that he was seeing cifs.ko spam the logs with messages
like this:

    CIFS VFS: Unexpected lookup error -26

He was listing the root directory of a server and hitting an error when
trying to QUERY_PATH_INFO against hiberfil.sys and pagefile.sys. The
right fix would be to switch the lookup code over to using FIND_FIRST,
but until then we really don't need to report this at a level of
KERN_ERR. Convert this message over to FYI level.

Reported-by: "Andrey Shernyukov" <andreysh@nioch.nsc.ru>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5384c2a640ca..11ff5f116b20 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -756,7 +756,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
 	} else if (rc != -EACCES) {
-		cifs_dbg(VFS, "Unexpected lookup error %d\n", rc);
+		cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
 		/* We special case check for Access Denied - since that
 		is a common return code */
 	}
-- 
cgit v1.3


From dca692880e887739a669f6c41a80ca68ce2b09fc Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 11 Nov 2013 16:42:37 -0600
Subject: [CIFS] O_DIRECT opens should work on directio mounts

Opens on current cifs/smb2/smb3 mounts with O_DIRECT flag fail
even when caching is disabled on the mount.  This was
reported by those running SMB2 benchmarks who need to
be able to pass O_DIRECT on many of their open calls to
reduce caching effects, but would also be needed by other
applications.

When mounting with forcedirectio ("cache=none") cifs and smb2/smb3
do not go through the page cache and thus opens with O_DIRECT flag
should work (when posix extensions are negotiated we even are
able to send the flag to the server). This patch fixes that
in a simple way.

The 9P client has a similar situation (caching is often disabled)
and takes the same approach to O_DIRECT support ie works if caching
disabled, but if client caching enabled it fails with EINVAL.

A followon idea for a future patch as Pavel noted, could
be that files opened with O_DIRECT could cause us to change
inode->i_fop on the fly from

cifs_file_strict_ops

to

cifs_file_direct_ops

which would allow us to support this on non-forcedirectio mounts
(cache=strict and cache=loose) as well.

Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/file.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7ddddf2e2504..5a5a87240fe2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3663,6 +3663,27 @@ void cifs_oplock_break(struct work_struct *work)
 	}
 }
 
+/*
+ * The presence of cifs_direct_io() in the address space ops vector
+ * allowes open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
+ * so this method should never be called.
+ *
+ * Direct IO is not yet supported in the cached mode. 
+ */
+static ssize_t
+cifs_direct_io(int rw, struct kiocb *iocb, const struct iovec *iov,
+               loff_t pos, unsigned long nr_segs)
+{
+        /*
+         * FIXME
+         * Eventually need to support direct IO for non forcedirectio mounts
+         */
+        return -EINVAL;
+}
+
+
 const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
@@ -3672,6 +3693,7 @@ const struct address_space_operations cifs_addr_ops = {
 	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = cifs_release_page,
+	.direct_IO = cifs_direct_io,
 	.invalidatepage = cifs_invalidate_page,
 	.launder_page = cifs_launder_page,
 };
-- 
cgit v1.3


From 2c957ddf30897787e39462ac56cdc4bf21eb0465 Mon Sep 17 00:00:00 2001
From: Tim Gardner <tim.gardner@canonical.com>
Date: Thu, 7 Nov 2013 16:40:57 -0700
Subject: cifs: Use data structures to compute NTLMv2 response offsets

A bit of cleanup plus some gratuitous variable renaming. I think using
structures instead of numeric offsets makes this code much more
understandable.

Also added a comment about current time range expected by
the server.

Acked-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Shirish Pargaonkar <spargaonkar@suse.com>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsencrypt.c | 40 ++++++++++++++++++++++++----------------
 fs/cifs/cifspdu.h     |  8 +++++++-
 2 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index fc6f4f3a1a9d..4934347321d3 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -548,7 +548,13 @@ static int
 CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 {
 	int rc;
-	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+	struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
+	    (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+	unsigned int hash_len;
+
+	/* The MD5 hash starts at challenge_key.key */
+	hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
+		offsetof(struct ntlmv2_resp, challenge.key[0]));
 
 	if (!ses->server->secmech.sdeschmacmd5) {
 		cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
@@ -556,7 +562,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 	}
 
 	rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
-				ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+				 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
 			 __func__);
@@ -570,20 +576,21 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 	}
 
 	if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
-		memcpy(ses->auth_key.response + offset,
-			ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+		memcpy(ntlmv2->challenge.key,
+		       ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
 	else
-		memcpy(ses->auth_key.response + offset,
-			ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+		memcpy(ntlmv2->challenge.key,
+		       ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
 	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
-		ses->auth_key.response + offset, ses->auth_key.len - offset);
+				 ntlmv2->challenge.key, hash_len);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
 		return rc;
 	}
 
+	/* Note that the MD5 digest over writes anon.challenge_key.key */
 	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
-		ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+				ntlmv2->ntlmv2_hash);
 	if (rc)
 		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
 
@@ -627,7 +634,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	int rc;
 	int baselen;
 	unsigned int tilen;
-	struct ntlmv2_resp *buf;
+	struct ntlmv2_resp *ntlmv2;
 	char ntlmv2_hash[16];
 	unsigned char *tiblob = NULL; /* target info blob */
 
@@ -660,13 +667,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	}
 	ses->auth_key.len += baselen;
 
-	buf = (struct ntlmv2_resp *)
+	ntlmv2 = (struct ntlmv2_resp *)
 			(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
-	buf->blob_signature = cpu_to_le32(0x00000101);
-	buf->reserved = 0;
-	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
-	buf->reserved2 = 0;
+	ntlmv2->blob_signature = cpu_to_le32(0x00000101);
+	ntlmv2->reserved = 0;
+	/* Must be within 5 minutes of the server */
+	ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
+	ntlmv2->reserved2 = 0;
 
 	memcpy(ses->auth_key.response + baselen, tiblob, tilen);
 
@@ -706,7 +714,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	}
 
 	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
-		ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+		ntlmv2->ntlmv2_hash,
 		CIFS_HMAC_MD5_HASH_SIZE);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 9e5ee34de986..33df36ef9d52 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -697,7 +697,13 @@ struct ntlmssp2_name {
 } __attribute__((packed));
 
 struct ntlmv2_resp {
-	char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+	union {
+	    char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+	    struct {
+		__u8 reserved[8];
+		__u8 key[CIFS_SERVER_CHALLENGE_SIZE];
+	    } __attribute__((packed)) challenge;
+	} __attribute__((packed));
 	__le32 blob_signature;
 	__u32  reserved;
 	__le64  time;
-- 
cgit v1.3


From 69e9c6c6dc87587d8846e447febedefe037c14f8 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Thu, 5 Sep 2013 16:58:43 +0200
Subject: Btrfs: eliminate the exceptional root_tree refs=0

The fact that btrfs_root_refs() returned 0 for the tree_root caused
bugs in the past, therefore it is set to 1 with this patch and
(hopefully) all affected code is adapted to this change.

I verified this change by temporarily adding WARN_ON() checks
everywhere where btrfs_root_refs() is used, checking whether the
logic of the code is changed by btrfs_root_refs() returning 1
instead of 0 for root->root_key.objectid == BTRFS_ROOT_TREE_OBJECTID.
With these added checks, I ran the xfstests './check -g auto'.

The two roots chunk_root and log_root_tree that are only referenced
by the superblock and the log_roots below the log_root_tree still
have btrfs_root_refs() == 0, only the tree_root is changed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c   |  1 +
 fs/btrfs/inode-map.c |  3 +--
 fs/btrfs/inode.c     | 21 ++++++++-------------
 3 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62176ad89846..f724397b396b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2670,6 +2670,7 @@ retry_root_backup:
 
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
+	btrfs_set_root_refs(&tree_root->root_item, 1);
 
 	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
 	location.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2c66ddbbe670..d11e1c6610b6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -412,8 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 		return 0;
 
 	/* Don't save inode cache if we are deleting this root */
-	if (btrfs_root_refs(&root->root_item) == 0 &&
-	    root != root->fs_info->tree_root)
+	if (btrfs_root_refs(&root->root_item) == 0)
 		return 0;
 
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51e3afa78354..dfc60aa40332 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4472,8 +4472,10 @@ void btrfs_evict_inode(struct inode *inode)
 	trace_btrfs_inode_evict(inode);
 
 	truncate_inode_pages(&inode->i_data, 0);
-	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
-			       btrfs_is_free_space_inode(inode)))
+	if (inode->i_nlink &&
+	    ((btrfs_root_refs(&root->root_item) != 0 &&
+	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+	     btrfs_is_free_space_inode(inode)))
 		goto no_delete;
 
 	if (is_bad_inode(inode)) {
@@ -4490,7 +4492,8 @@ void btrfs_evict_inode(struct inode *inode)
 	}
 
 	if (inode->i_nlink > 0) {
-		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
 		goto no_delete;
 	}
 
@@ -4731,14 +4734,7 @@ static void inode_tree_del(struct inode *inode)
 	}
 	spin_unlock(&root->inode_lock);
 
-	/*
-	 * Free space cache has inodes in the tree root, but the tree root has a
-	 * root_refs of 0, so this could end up dropping the tree root as a
-	 * snapshot, so we need the extra !root->fs_info->tree_root check to
-	 * make sure we don't drop it.
-	 */
-	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
-	    root != root->fs_info->tree_root) {
+	if (empty && btrfs_root_refs(&root->root_item) == 0) {
 		synchronize_srcu(&root->fs_info->subvol_srcu);
 		spin_lock(&root->inode_lock);
 		empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -7857,8 +7853,7 @@ int btrfs_drop_inode(struct inode *inode)
 		return 1;
 
 	/* the snap/subvol tree is on deleting */
-	if (btrfs_root_refs(&root->root_item) == 0 &&
-	    root != root->fs_info->tree_root)
+	if (btrfs_root_refs(&root->root_item) == 0)
 		return 1;
 	else
 		return generic_drop_inode(inode);
-- 
cgit v1.3


From f06becc4119856c984e3beef54aa75538e656c6d Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 16 Sep 2013 09:53:28 +0100
Subject: Btrfs: don't store NULL byte in symlink extents

It is not necessary to store the NULL byte in a symlink inline file
extent. There's currently no code that requires the NULL byte to be
present in the extent. This change also doesn't break file format
compatibility nor the send/receive feature.

The VFS also doesn't need the NULL byte to be present in the extent,
as it reads up to inode->i_size bytes (which already excluded the NULL
byte) and sets the NULL byte for us (in fs/namei.c:page_getlink()).

So with this change we save 1 byte per symlink file extent (which is
always inlined in the btree leaf) without losing backward and forward
compatibility.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dfc60aa40332..59b4e2e495ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8339,7 +8339,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	struct btrfs_file_extent_item *ei;
 	struct extent_buffer *leaf;
 
-	name_len = strlen(symname) + 1;
+	name_len = strlen(symname);
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
@@ -8427,7 +8427,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	inode_set_bytes(inode, name_len);
-	btrfs_i_size_write(inode, name_len - 1);
+	btrfs_i_size_write(inode, name_len);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		drop_inode = 1;
-- 
cgit v1.3


From dd3cc16b8750251ea9b1a843ce7806e82b015d5e Mon Sep 17 00:00:00 2001
From: Ross Kirk <ross.kirk@gmail.com>
Date: Mon, 16 Sep 2013 15:58:09 +0100
Subject: btrfs: drop unused parameter from btrfs_item_nr

Remove unused eb parameter from btrfs_item_nr

Signed-off-by: Ross Kirk <ross.kirk@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/backref.c    |  2 +-
 fs/btrfs/ctree.c      | 34 +++++++++++++++++-----------------
 fs/btrfs/ctree.h      | 13 ++++++-------
 fs/btrfs/dir-item.c   |  2 +-
 fs/btrfs/inode-item.c |  2 +-
 fs/btrfs/inode.c      |  4 ++--
 fs/btrfs/print-tree.c |  2 +-
 fs/btrfs/send.c       |  4 ++--
 8 files changed, 31 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 0552a599b28f..721936a5b2ca 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1619,7 +1619,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
-		item = btrfs_item_nr(eb, slot);
+		item = btrfs_item_nr(slot);
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
 
 		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 61b5bcd57b7e..c274a752c93d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3337,8 +3337,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 	if (!nr)
 		return 0;
 	btrfs_init_map_token(&token);
-	start_item = btrfs_item_nr(l, start);
-	end_item = btrfs_item_nr(l, end);
+	start_item = btrfs_item_nr(start);
+	end_item = btrfs_item_nr(end);
 	data_len = btrfs_token_item_offset(l, start_item, &token) +
 		btrfs_token_item_size(l, start_item, &token);
 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
@@ -3406,7 +3406,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	slot = path->slots[1];
 	i = left_nritems - 1;
 	while (i >= nr) {
-		item = btrfs_item_nr(left, i);
+		item = btrfs_item_nr(i);
 
 		if (!empty && push_items > 0) {
 			if (path->slots[0] > i)
@@ -3470,7 +3470,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
-		item = btrfs_item_nr(right, i);
+		item = btrfs_item_nr(i);
 		push_space -= btrfs_token_item_size(right, item, &token);
 		btrfs_set_token_item_offset(right, item, push_space, &token);
 	}
@@ -3612,7 +3612,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		nr = min(right_nritems - 1, max_slot);
 
 	for (i = 0; i < nr; i++) {
-		item = btrfs_item_nr(right, i);
+		item = btrfs_item_nr(i);
 
 		if (!empty && push_items > 0) {
 			if (path->slots[0] < i)
@@ -3663,7 +3663,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		u32 ioff;
 
-		item = btrfs_item_nr(left, i);
+		item = btrfs_item_nr(i);
 
 		ioff = btrfs_token_item_offset(left, item, &token);
 		btrfs_set_token_item_offset(left, item,
@@ -3694,7 +3694,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
-		item = btrfs_item_nr(right, i);
+		item = btrfs_item_nr(i);
 
 		push_space = push_space - btrfs_token_item_size(right,
 								item, &token);
@@ -3835,7 +3835,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
 		      btrfs_item_end_nr(l, mid);
 
 	for (i = 0; i < nritems; i++) {
-		struct btrfs_item *item = btrfs_item_nr(right, i);
+		struct btrfs_item *item = btrfs_item_nr(i);
 		u32 ioff;
 
 		ioff = btrfs_token_item_offset(right, item, &token);
@@ -4177,7 +4177,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
 
 	btrfs_set_path_blocking(path);
 
-	item = btrfs_item_nr(leaf, path->slots[0]);
+	item = btrfs_item_nr(path->slots[0]);
 	orig_offset = btrfs_item_offset(leaf, item);
 	item_size = btrfs_item_size(leaf, item);
 
@@ -4200,7 +4200,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
 	btrfs_set_item_key(leaf, &disk_key, slot);
 
-	new_item = btrfs_item_nr(leaf, slot);
+	new_item = btrfs_item_nr(slot);
 
 	btrfs_set_item_offset(leaf, new_item, orig_offset);
 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
@@ -4339,7 +4339,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 	/* first correct the data pointers */
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
-		item = btrfs_item_nr(leaf, i);
+		item = btrfs_item_nr(i);
 
 		ioff = btrfs_token_item_offset(leaf, item, &token);
 		btrfs_set_token_item_offset(leaf, item,
@@ -4387,7 +4387,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 			fixup_low_keys(root, path, &disk_key, 1);
 	}
 
-	item = btrfs_item_nr(leaf, slot);
+	item = btrfs_item_nr(slot);
 	btrfs_set_item_size(leaf, item, new_size);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -4441,7 +4441,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 	/* first correct the data pointers */
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
-		item = btrfs_item_nr(leaf, i);
+		item = btrfs_item_nr(i);
 
 		ioff = btrfs_token_item_offset(leaf, item, &token);
 		btrfs_set_token_item_offset(leaf, item,
@@ -4455,7 +4455,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 
 	data_end = old_data;
 	old_size = btrfs_item_size_nr(leaf, slot);
-	item = btrfs_item_nr(leaf, slot);
+	item = btrfs_item_nr(slot);
 	btrfs_set_item_size(leaf, item, old_size + data_size);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -4514,7 +4514,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
 
-			item = btrfs_item_nr(leaf, i);
+			item = btrfs_item_nr( i);
 			ioff = btrfs_token_item_offset(leaf, item, &token);
 			btrfs_set_token_item_offset(leaf, item,
 						    ioff - total_data, &token);
@@ -4535,7 +4535,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	for (i = 0; i < nr; i++) {
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
 		btrfs_set_item_key(leaf, &disk_key, slot + i);
-		item = btrfs_item_nr(leaf, slot + i);
+		item = btrfs_item_nr(slot + i);
 		btrfs_set_token_item_offset(leaf, item,
 					    data_end - data_size[i], &token);
 		data_end -= data_size[i];
@@ -4730,7 +4730,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		for (i = slot + nr; i < nritems; i++) {
 			u32 ioff;
 
-			item = btrfs_item_nr(leaf, i);
+			item = btrfs_item_nr(i);
 			ioff = btrfs_token_item_offset(leaf, item, &token);
 			btrfs_set_token_item_offset(leaf, item,
 						    ioff + dsize, &token);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0506f40ede83..fa117f7d5a8f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2461,8 +2461,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr)
 		sizeof(struct btrfs_item) * nr;
 }
 
-static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
-					       int nr)
+static inline struct btrfs_item *btrfs_item_nr(int nr)
 {
 	return (struct btrfs_item *)btrfs_item_nr_offset(nr);
 }
@@ -2475,30 +2474,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb,
 
 static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
 {
-	return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
+	return btrfs_item_end(eb, btrfs_item_nr(nr));
 }
 
 static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
 {
-	return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
+	return btrfs_item_offset(eb, btrfs_item_nr(nr));
 }
 
 static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
 {
-	return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
+	return btrfs_item_size(eb, btrfs_item_nr(nr));
 }
 
 static inline void btrfs_item_key(struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
 {
-	struct btrfs_item *item = btrfs_item_nr(eb, nr);
+	struct btrfs_item *item = btrfs_item_nr(nr);
 	read_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
 static inline void btrfs_set_item_key(struct extent_buffer *eb,
 			       struct btrfs_disk_key *disk_key, int nr)
 {
-	struct btrfs_item *item = btrfs_item_nr(eb, nr);
+	struct btrfs_item *item = btrfs_item_nr(nr);
 	write_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 79e594e341c7..1c529db1697f 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -58,7 +58,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
 	leaf = path->nodes[0];
-	item = btrfs_item_nr(leaf, path->slots[0]);
+	item = btrfs_item_nr(path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 	BUG_ON(data_size > btrfs_item_size(leaf, item));
 	ptr += btrfs_item_size(leaf, item) - data_size;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index e0b7034d6343..ec82fae07097 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -369,7 +369,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 		goto out;
 
 	leaf = path->nodes[0];
-	item = btrfs_item_nr(leaf, path->slots[0]);
+	item = btrfs_item_nr(path->slots[0]);
 	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
 	ptr += btrfs_item_size(leaf, item) - ins_len;
 	extref = (struct btrfs_inode_extref *)ptr;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59b4e2e495ac..57b27012d43a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5044,7 +5044,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 			continue;
 		}
 
-		item = btrfs_item_nr(leaf, slot);
+		item = btrfs_item_nr(slot);
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		if (found_key.objectid != key.objectid)
@@ -5856,7 +5856,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 	compress_type = btrfs_file_extent_compression(leaf, item);
 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
 	inline_size = btrfs_file_extent_inline_item_len(leaf,
-					btrfs_item_nr(leaf, path->slots[0]));
+					btrfs_item_nr(path->slots[0]));
 	tmp = kmalloc(inline_size, GFP_NOFS);
 	if (!tmp)
 		return -ENOMEM;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0088bedc8631..417053b17181 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -193,7 +193,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
 		   btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
-		item = btrfs_item_nr(l, i);
+		item = btrfs_item_nr(i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = btrfs_key_type(&key);
 		printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e46e0ed74925..0a894399be16 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -791,7 +791,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 	if (found_key->type == BTRFS_INODE_REF_KEY) {
 		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
 						    struct btrfs_inode_ref);
-		item = btrfs_item_nr(eb, slot);
+		item = btrfs_item_nr(slot);
 		total = btrfs_item_size(eb, item);
 		elem_size = sizeof(*iref);
 	} else {
@@ -905,7 +905,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 
 	eb = path->nodes[0];
 	slot = path->slots[0];
-	item = btrfs_item_nr(eb, slot);
+	item = btrfs_item_nr(slot);
 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
 	cur = 0;
 	len = 0;
-- 
cgit v1.3


From 06ea65a398a2501e94beee3a425d07e1846ff25a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 19 Sep 2013 16:07:01 -0400
Subject: Btrfs: add a sanity test for btrfs_split_item

While looking at somebodys corruption I became completely convinced that
btrfs_split_item was broken, so I wrote this test to verify that it was working
as it was supposed to.  Thankfully it appears to be working as intended, so just
add this test to make sure nobody breaks it in the future.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Makefile                    |   3 +-
 fs/btrfs/ctree.h                     |   4 +-
 fs/btrfs/disk-io.c                   |  38 +++++-
 fs/btrfs/disk-io.h                   |   4 +
 fs/btrfs/super.c                     |   7 +-
 fs/btrfs/tests/btrfs-tests.h         |   5 +
 fs/btrfs/tests/extent-buffer-tests.c | 229 +++++++++++++++++++++++++++++++++++
 7 files changed, 283 insertions(+), 7 deletions(-)
 create mode 100644 fs/btrfs/tests/extent-buffer-tests.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a91a6a355cc5..4c7dfbfaa3b3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -14,4 +14,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
-btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+	tests/extent-buffer-tests.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa117f7d5a8f..ee0b8aa82828 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1724,7 +1724,9 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
-
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	int dummy_root;
+#endif
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f724397b396b..db2095486a4f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1229,14 +1229,18 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->refs, 1);
 	root->log_transid = 0;
 	root->last_log_commit = 0;
-	extent_io_tree_init(&root->dirty_log_pages,
-			     fs_info->btree_inode->i_mapping);
+	if (fs_info)
+		extent_io_tree_init(&root->dirty_log_pages,
+				     fs_info->btree_inode->i_mapping);
 
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
-	root->defrag_trans_start = fs_info->generation;
+	if (fs_info)
+		root->defrag_trans_start = fs_info->generation;
+	else
+		root->defrag_trans_start = 0;
 	init_completion(&root->kobj_unregister);
 	root->defrag_running = 0;
 	root->root_key.objectid = objectid;
@@ -1253,6 +1257,22 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 	return root;
 }
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(void)
+{
+	struct btrfs_root *root;
+
+	root = btrfs_alloc_root(NULL);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+	__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+	root->dummy_root = 1;
+
+	return root;
+}
+#endif
+
 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     struct btrfs_fs_info *fs_info,
 				     u64 objectid)
@@ -3670,10 +3690,20 @@ int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root;
 	u64 transid = btrfs_header_generation(buf);
 	int was_dirty;
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	/*
+	 * This is a fast path so only do this check if we have sanity tests
+	 * enabled.  Normal people shouldn't be marking dummy buffers as dirty
+	 * outside of the sanity tests.
+	 */
+	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
+		return;
+#endif
+	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
 	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation)
 		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 5ce2a7da8b11..53059df350f8 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -86,6 +86,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 				 struct btrfs_root *root);
 void btrfs_free_fs_root(struct btrfs_root *root);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(void);
+#endif
+
 /*
  * This function is used to grab the root, and avoid it is freed when we
  * access it. But it doesn't ensure that the tree is not dropped.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e913328d0f2a..0e398657d759 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1789,7 +1789,12 @@ static void btrfs_print_info(void)
 
 static int btrfs_run_sanity_tests(void)
 {
-	return btrfs_test_free_space_cache();
+	int ret;
+
+	ret = btrfs_test_free_space_cache();
+	if (ret)
+		return ret;
+	return btrfs_test_extent_buffer_operations();
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 580877625776..04f2cd2ca568 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,11 +24,16 @@
 #define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__)
 
 int btrfs_test_free_space_cache(void);
+int btrfs_test_extent_buffer_operations(void);
 #else
 static inline int btrfs_test_free_space_cache(void)
 {
 	return 0;
 }
+static inline int btrfs_test_extent_buffer_operations(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644
index 000000000000..cc286ce97d1e
--- /dev/null
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(void)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct extent_buffer *eb;
+	struct btrfs_item *item;
+	char *value = "mary had a little lamb";
+	char *split1 = "mary had a little";
+	char *split2 = " lamb";
+	char *split3 = "mary";
+	char *split4 = " had a little";
+	char buf[32];
+	struct btrfs_key key;
+	u32 value_len = strlen(value);
+	int ret = 0;
+
+	test_msg("Running btrfs_split_item tests\n");
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Could not allocate root\n");
+		return PTR_ERR(root);
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Could not allocate path\n");
+		kfree(root);
+		return -ENOMEM;
+	}
+
+	path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+	if (!eb) {
+		test_msg("Could not allocate dummy buffer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	path->slots[0] = 0;
+
+	key.objectid = 0;
+	key.type = BTRFS_EXTENT_CSUM_KEY;
+	key.offset = 0;
+
+	setup_items_for_insert(root, path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+	item = btrfs_item_nr(0);
+	write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+			    value_len);
+
+	key.offset = 3;
+
+	/*
+	 * Passing NULL trans here should be safe because we have plenty of
+	 * space in this leaf to split the item without having to split the
+	 * leaf.
+	 */
+	ret = btrfs_split_item(NULL, root, path, &key, 17);
+	if (ret) {
+		test_msg("Split item failed %d\n", ret);
+		goto out;
+	}
+
+	/*
+	 * Read the first slot, it should have the original key and contain only
+	 * 'mary had a little'
+	 */
+	btrfs_item_key_to_cpu(eb, &key, 0);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 0) {
+		test_msg("Invalid key at slot 0\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(0);
+	if (btrfs_item_size(eb, item) != strlen(split1)) {
+		test_msg("Invalid len in the first split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+			   strlen(split1));
+	if (memcmp(buf, split1, strlen(split1))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the first split have='%.*s' want '%s'\n",
+			 (int)strlen(split1), buf, split1);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 1);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 3) {
+		test_msg("Invalid key at slot 1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(1);
+	if (btrfs_item_size(eb, item) != strlen(split2)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+			   strlen(split2));
+	if (memcmp(buf, split2, strlen(split2))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	key.offset = 1;
+	/* Do it again so we test memmoving the other items in the leaf */
+	ret = btrfs_split_item(NULL, root, path, &key, 4);
+	if (ret) {
+		test_msg("Second split item failed %d\n", ret);
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 0);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 0) {
+		test_msg("Invalid key at slot 0\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(0);
+	if (btrfs_item_size(eb, item) != strlen(split3)) {
+		test_msg("Invalid len in the first split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+			   strlen(split3));
+	if (memcmp(buf, split3, strlen(split3))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the third split");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 1);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 1) {
+		test_msg("Invalid key at slot 1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(1);
+	if (btrfs_item_size(eb, item) != strlen(split4)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+			   strlen(split4));
+	if (memcmp(buf, split4, strlen(split4))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the fourth split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 2);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 3) {
+		test_msg("Invalid key at slot 2\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(2);
+	if (btrfs_item_size(eb, item) != strlen(split2)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+			   strlen(split2));
+	if (memcmp(buf, split2, strlen(split2))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the last chunk\n");
+		ret = -EINVAL;
+		goto out;
+	}
+out:
+	btrfs_free_path(path);
+	kfree(root);
+	return ret;
+}
+
+int btrfs_test_extent_buffer_operations(void)
+{
+	test_msg("Running extent buffer operation tests");
+	return test_btrfs_split_item();
+}
-- 
cgit v1.3


From d4b4087c43cc00a196c5be57fac41f41309f1d56 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 24 Sep 2013 14:09:34 -0400
Subject: Btrfs: do a full search everytime in btrfs_search_old_slot

While running some snashot aware defrag tests I noticed I was panicing every
once and a while in key_search.  This is because of the optimization that says
if we find a key at slot 0 it will be at slot 0 all the way down the rest of the
tree.  This isn't the case for btrfs_search_old_slot since it will likely replay
changes to a buffer if something has changed since we took our sequence number.
So short circuit this optimization by setting prev_cmp to -1 every time we call
key_search so we will do our normal binary search.  With this patch I am no
longer seeing the panics I was seeing before.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c274a752c93d..6932686b61cf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2758,7 +2758,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 	int level;
 	int lowest_unlock = 1;
 	u8 lowest_level = 0;
-	int prev_cmp;
+	int prev_cmp = -1;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(p->nodes[0] != NULL);
@@ -2769,7 +2769,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 	}
 
 again:
-	prev_cmp = -1;
 	b = get_old_root(root, time_seq);
 	level = btrfs_header_level(b);
 	p->locks[level] = BTRFS_READ_LOCK;
@@ -2787,6 +2786,11 @@ again:
 		 */
 		btrfs_unlock_up_safe(p, level + 1);
 
+		/*
+		 * Since we can unwind eb's we want to do a real search every
+		 * time.
+		 */
+		prev_cmp = -1;
 		ret = key_search(b, key, level, &prev_cmp, &slot);
 
 		if (level != 0) {
-- 
cgit v1.3


From 53645a91f4baed059ec5dfb22340cf3e6b8fdd2c Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Fri, 20 Sep 2013 14:43:28 +0100
Subject: Btrfs: remove duplicated ino cache's inode lookup

We're doing a unnecessary extra lookup of the ino cache's
inode when we already have it (and holding a reference)
during the process of saving the ino cache contents to disk.
Therefore remove this extra lookup.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/free-space-cache.c | 9 ++-------
 fs/btrfs/free-space-cache.h | 3 ++-
 fs/btrfs/inode-map.c        | 2 +-
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b4f9904c4c6b..9a1e371a6eba 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2967,19 +2967,15 @@ out:
 
 int btrfs_write_out_ino_cache(struct btrfs_root *root,
 			      struct btrfs_trans_handle *trans,
-			      struct btrfs_path *path)
+			      struct btrfs_path *path,
+			      struct inode *inode)
 {
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
-	struct inode *inode;
 	int ret;
 
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return 0;
 
-	inode = lookup_free_ino_inode(root, path);
-	if (IS_ERR(inode))
-		return 0;
-
 	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
 	if (ret) {
 		btrfs_delalloc_release_metadata(inode, inode->i_size);
@@ -2990,7 +2986,6 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 #endif
 	}
 
-	iput(inode);
 	return ret;
 }
 
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e737f92cf6d0..ba8d5a1b64b5 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -76,7 +76,8 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info,
 			struct btrfs_root *root);
 int btrfs_write_out_ino_cache(struct btrfs_root *root,
 			      struct btrfs_trans_handle *trans,
-			      struct btrfs_path *path);
+			      struct btrfs_path *path,
+			      struct inode *inode);
 
 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d11e1c6610b6..b863e45d373b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -503,7 +503,7 @@ again:
 	}
 	btrfs_free_reserved_data_space(inode, prealloc);
 
-	ret = btrfs_write_out_ino_cache(root, trans, path);
+	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
 	iput(inode);
 out_release:
-- 
cgit v1.3


From 74514323947ef27347564bfd7a663fdb3429cb20 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Fri, 20 Sep 2013 14:46:51 +0100
Subject: Btrfs: remove path arg from btrfs_truncate_free_space_cache

Not used for anything, and removing it avoids caller's need to
allocate a path structure.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c      |  3 +--
 fs/btrfs/free-space-cache.c |  1 -
 fs/btrfs/free-space-cache.h |  1 -
 fs/btrfs/inode-map.c        |  2 +-
 fs/btrfs/relocation.c       | 11 +----------
 5 files changed, 3 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d58bef130a41..55b537fcb81a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3197,8 +3197,7 @@ again:
 		if (ret)
 			goto out_put;
 
-		ret = btrfs_truncate_free_space_cache(root, trans, path,
-						      inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, inode);
 		if (ret)
 			goto out_put;
 	}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a1e371a6eba..4772f3a8e7e5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -218,7 +218,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
-				    struct btrfs_path *path,
 				    struct inode *inode)
 {
 	int ret = 0;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ba8d5a1b64b5..0cf4977ef70d 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -58,7 +58,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 				       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
-				    struct btrfs_path *path,
 				    struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_group_cache *block_group);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b863e45d373b..014de497e888 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -466,7 +466,7 @@ again:
 	}
 
 	if (i_size_read(inode) > 0) {
-		ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, inode);
 		if (ret) {
 			if (ret != -ENOSPC)
 				btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4a355726151e..dec4f5aead0f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3407,7 +3407,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 				    struct inode *inode, u64 ino)
 {
 	struct btrfs_key key;
-	struct btrfs_path *path;
 	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
@@ -3432,22 +3431,14 @@ truncate:
 	if (ret)
 		goto out;
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans)) {
-		btrfs_free_path(path);
 		ret = PTR_ERR(trans);
 		goto out;
 	}
 
-	ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+	ret = btrfs_truncate_free_space_cache(root, trans, inode);
 
-	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 out:
-- 
cgit v1.3


From fe09e16cc8d444ecc52f6f9a651946f16fffa4e1 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Sun, 22 Sep 2013 12:54:23 +0800
Subject: Btrfs: export btrfs space shared info to userspace

Similar to ocfs2, btrfs also supports that extents can be shared by
different inodes, and there are some userspace tools requesting
for this kind of 'space shared infomation'.[1]

ocfs2 uses flag FIEMAP_EXTENT_SHARED, so does btrfs.

[1]: http://thr3ads.net/ocfs2-devel/2010/09/489052-PATCH-3-3-shared-du-using-fiemap-to-figure-up-the-shared-extents-per-file-and-the-footprint-in

Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent_io.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 51731b76900d..2bf6f46fae44 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
 #include "check-integrity.h"
 #include "locking.h"
 #include "rcu-string.h"
+#include "backref.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -4062,6 +4063,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 	return NULL;
 }
 
+static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
+{
+	unsigned long cnt = *((unsigned long *)ctx);
+
+	cnt++;
+	*((unsigned long *)ctx) = cnt;
+
+	/* Now we're sure that the extent is shared. */
+	if (cnt > 1)
+		return 1;
+	return 0;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -4128,7 +4142,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		last = found_key.offset;
 		last_for_get_extent = last + 1;
 	}
-	btrfs_free_path(path);
+	btrfs_release_path(path);
 
 	/*
 	 * we might have some extents allocated but more delalloc past those
@@ -4198,7 +4212,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
 		} else {
+			unsigned long ref_cnt = 0;
+
 			disko = em->block_start + offset_in_extent;
+
+			/*
+			 * As btrfs supports shared space, this information
+			 * can be exported to userspace tools via
+			 * flag FIEMAP_EXTENT_SHARED.
+			 */
+			ret = iterate_inodes_from_logical(
+					em->block_start,
+					BTRFS_I(inode)->root->fs_info,
+					path, count_ext_ref, &ref_cnt);
+			if (ret < 0 && ret != -ENOENT)
+				goto out_free;
+
+			if (ref_cnt > 1)
+				flags |= FIEMAP_EXTENT_SHARED;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
@@ -4230,6 +4261,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
+	btrfs_free_path(path);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			     &cached_state, GFP_NOFS);
 	return ret;
-- 
cgit v1.3


From 703c88e035242202e3ab48fcbbbe0a7bc62fb7bb Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Sun, 22 Sep 2013 21:54:55 +0100
Subject: Btrfs: fix tracking of orphan inode count

In inode.c:btrfs_orphan_add() if we failed to insert the orphan
item, we would return without decrementing the orphan count that
we just incremented before attempting the insertion, leaving the
orphan inode count wrong.

In inode.c:btrfs_orphan_del(), we were decrementing the inode
orphan count if the bit BTRFS_INODE_ORPHAN_META_RESERVED was set,
which is logically wrong because it should be decremented if the
bit BTRFS_INODE_HAS_ORPHAN_ITEM was set - after all we increment
the count when we set the bit BTRFS_INODE_HAS_ORPHAN_ITEM elsewhere.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 57b27012d43a..939dd1ff34a7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2969,6 +2969,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	if (insert >= 1) {
 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
 		if (ret) {
+			atomic_dec(&root->orphan_inodes);
 			if (reserve) {
 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
 					  &BTRFS_I(inode)->runtime_flags);
@@ -3018,14 +3019,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 		release_rsv = 1;
 	spin_unlock(&root->orphan_lock);
 
-	if (trans && delete_item)
-		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
-	if (release_rsv) {
-		btrfs_orphan_release_metadata(inode);
+	if (delete_item) {
 		atomic_dec(&root->orphan_inodes);
+		if (trans)
+			ret = btrfs_del_orphan_item(trans, root,
+						    btrfs_ino(inode));
 	}
 
+	if (release_rsv)
+		btrfs_orphan_release_metadata(inode);
+
 	return ret;
 }
 
-- 
cgit v1.3


From 9b1998598625fb5b798e8291cafda1a8ec17c1bd Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 23 Sep 2013 11:35:11 +0100
Subject: Btrfs: fix sync fs to actually wait for all data to be persisted

Currently the fs sync function (super.c:btrfs_sync_fs()) doesn't
wait for delayed work to finish before returning success to the
caller. This change fixes this, ensuring that there's no data loss
if a power failure happens right after fs sync returns success to
the caller and before the next commit happens.

Steps to reproduce the data loss issue:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ perl -e '$d = ("\x41" x 6001); open($f,">","/mnt/btrfs/foobar"); print $f $d; close($f);' && btrfs fi sync /mnt/btrfs

Right after the btrfs fi sync command (a second or 2 for example), power
off the machine and reboot it. The file will be empty, as it can be verified
after mounting the filesystem and through btrfs-debug-tree:

$ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8
        item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36
                location key (257 INODE_ITEM 0) type FILE
                namelen 6 datalen 0 name: foobar
        item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160
                inode generation 7 transid 7 size 0 block group 0 mode 100644 links 1
        item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16
                inode ref index 2 namelen 6 name: foobar
checksum tree key (CSUM_TREE ROOT_ITEM 0)
leaf 29429760 items 0 free space 3995 generation 7 owner 7
fs uuid 6192815c-af2a-4b75-b3db-a959ffb6166e
chunk uuid b529c44b-938c-4d3d-910a-013b4700bcae
uuid tree key (UUID_TREE ROOT_ITEM 0)

After this patch, the data loss no longer happens after a power failure and
btrfs-debug-tree shows:

$ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8
	item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36
		location key (257 INODE_ITEM 0) type FILE
		namelen 6 datalen 0 name: foobar
	item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160
		inode generation 6 transid 6 size 6001 block group 0 mode 100644 links 1
	item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16
		inode ref index 2 namelen 6 name: foobar
	item 6 key (257 EXTENT_DATA 0) itemoff 3522 itemsize 53
		extent data disk byte 12845056 nr 8192
		extent data offset 0 nr 8192 ram 8192
		extent compression 0
checksum tree key (CSUM_TREE ROOT_ITEM 0)

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Reviewed-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9d46f60cb943..385c58f2bba1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4557,9 +4557,15 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_logical_to_ino(root, argp);
 	case BTRFS_IOC_SPACE_INFO:
 		return btrfs_ioctl_space_info(root, argp);
-	case BTRFS_IOC_SYNC:
-		btrfs_sync_fs(file->f_dentry->d_sb, 1);
-		return 0;
+	case BTRFS_IOC_SYNC: {
+		int ret;
+
+		ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		if (ret)
+			return ret;
+		ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return ret;
+	}
 	case BTRFS_IOC_START_SYNC:
 		return btrfs_ioctl_start_sync(root, argp);
 	case BTRFS_IOC_WAIT_SYNC:
-- 
cgit v1.3


From e84cc14213e2c81ae5a2da341a9da0d58a1dbfad Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 9 Sep 2013 19:49:43 +0100
Subject: Btrfs: don't leak block group on error

In extent-tree.c:btrfs_write_dirty_block_groups(), if the call to
write_one_cache_group() failed, we would return without putting
the block group first.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 55b537fcb81a..54ee542c705e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3317,10 +3317,9 @@ again:
 		last = cache->key.objectid + cache->key.offset;
 
 		err = write_one_cache_group(trans, root, path, cache);
+		btrfs_put_block_group(cache);
 		if (err) /* File system offline */
 			goto out;
-
-		btrfs_put_block_group(cache);
 	}
 
 	while (1) {
-- 
cgit v1.3


From 4577b014d1bc3db386da3246f625888fc48083a9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 09:33:09 -0400
Subject: Btrfs: relocate csums properly with prealloc extents

A user reported a problem where they were getting csum errors when running a
balance and running systemd's journal.  This is because systemd is awesome and
fallocate()'s its log space and writes into it.  Unfortunately we assume that
when we read in all the csums for an extent that they are sequential starting at
the bytenr we care about.  This obviously isn't the case for prealloc extents,
where we could have written to the middle of the prealloc extent only, which
means the csum would be for the bytenr in the middle of our range and not the
front of our range.  Fix this by offsetting the new bytenr we are logging to
based on the original bytenr the csum was for.  With this patch I no longer see
the csum errors I was seeing.  Thanks,

Cc: stable@vger.kernel.org
Reported-by: Chris Murphy <lists@colorremedies.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dec4f5aead0f..0359eecac95d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4472,6 +4472,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 	u64 disk_bytenr;
+	u64 new_bytenr;
 	LIST_HEAD(list);
 
 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
@@ -4483,13 +4484,24 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	if (ret)
 		goto out;
 
-	disk_bytenr = ordered->start;
 	while (!list_empty(&list)) {
 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
 		list_del_init(&sums->list);
 
-		sums->bytenr = disk_bytenr;
-		disk_bytenr += sums->len;
+		/*
+		 * We need to offset the new_bytenr based on where the csum is.
+		 * We need to do this because we will read in entire prealloc
+		 * extents but we may have written to say the middle of the
+		 * prealloc extent, so we need to make sure the csum goes with
+		 * the right disk offset.
+		 *
+		 * We can do this because the data reloc inode refers strictly
+		 * to the on disk bytes, so we don't have to worry about
+		 * disk_len vs real len like with real inodes since it's all
+		 * disk length.
+		 */
+		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+		sums->bytenr = new_bytenr;
 
 		btrfs_add_ordered_sum(inode, ordered, sums);
 	}
-- 
cgit v1.3


From e0228285a8cad70e4b7b4833cc650e36ecd8de89 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 20 Sep 2013 22:26:29 -0400
Subject: Btrfs: reset intwrite on transaction abort

If we abort a transaction in the middle of a commit we weren't undoing the
intwrite locking.  This patch fixes that problem.

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/transaction.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8c81bdc1ef9b..7138d6a3726e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1552,6 +1552,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 		root->fs_info->running_transaction = NULL;
 	spin_unlock(&root->fs_info->trans_lock);
 
+	if (trans->type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
-- 
cgit v1.3


From b6d08f0630d51ec09d67f16f6d7839699bbc0402 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 14:57:43 -0400
Subject: Btrfs: do not release metadata for space cache inodes

I've been testing our error paths and I was tripping the BUG_ON() in
drop_outstanding_extent because our outstanding_extents is 0 for space cache
inodes.  This is because we don't reserve metadata space for these inodes since
we depend on the global block reserve for our space.  To fix this we need to
make sure the DO_ACCOUNTING stuff doesn't actually call release_metadata for
space cache inodes.  With this patch I'm no longer panicing.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 939dd1ff34a7..c0dcb4c3d72d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1551,7 +1551,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
-		if (*bits & EXTENT_DO_ACCOUNTING)
+		/*
+		 * We don't reserve metadata space for space cache inodes so we
+		 * don't need to call dellalloc_release_metadata if there is an
+		 * error.
+		 */
+		if (*bits & EXTENT_DO_ACCOUNTING &&
+		    root != root->fs_info->tree_root)
 			btrfs_delalloc_release_metadata(inode, len);
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-- 
cgit v1.3


From 4e121c06adf53aae478ebce3035116595d063413 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 16:32:39 -0400
Subject: Btrfs: cleanup transaction on abort

If we abort not during a transaction commit we won't clean up anything until we
unmount.  Unfortunately if we abort in the middle of writing out an ordered
extent we won't clean it up and if somebody is waiting on that ordered extent
they will wait forever.  To fix this just make the transaction kthread call the
cleanup transaction stuff if it notices theres an error, and make
btrfs_end_transaction wake up the transaction kthread if there is an error.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c     | 3 +++
 fs/btrfs/transaction.c | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index db2095486a4f..113cd43530ba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1800,6 +1800,9 @@ sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
+		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
+				      &root->fs_info->fs_state)))
+			btrfs_cleanup_transaction(root);
 		if (!try_to_freeze()) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!kthread_should_stop() &&
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7138d6a3726e..a1343e81c59d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -744,8 +744,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		btrfs_run_delayed_iputs(root);
 
 	if (trans->aborted ||
-	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+		wake_up_process(info->transaction_kthread);
 		err = -EIO;
+	}
 	assert_qgroups_uptodate(trans);
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
-- 
cgit v1.3


From 1de2cfde93c20a0357ff1dffed901598470facf3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 16:36:02 -0400
Subject: Btrfs: don't delete ordered roots from list during cleanup

During transaction cleanup after an abort we are just removing roots from the
ordered roots list which is incorrect.  We have a BUG_ON() to make sure that the
root is still part of the ordered roots list when we put our ordered extent
which we were tripping in this case.  So do like we do everywhere else and just
move it to the tail of the ordered roots list and allow the normal cleanup to
take care of stuff.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 113cd43530ba..01a26e2eb310 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3836,7 +3836,8 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 	while (!list_empty(&splice)) {
 		root = list_first_entry(&splice, struct btrfs_root,
 					ordered_root);
-		list_del_init(&root->ordered_root);
+		list_move_tail(&root->ordered_root,
+			       &fs_info->ordered_roots);
 
 		btrfs_destroy_ordered_extents(root);
 
-- 
cgit v1.3


From c16ce1901431629fbe5b9387cc966d62a089e4df Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 16:38:20 -0400
Subject: Btrfs: remove all BUG_ON()'s from commit_cowonly_roots

Noticed this when forcing errors to happen during delayed ref running.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/transaction.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a1343e81c59d..f08e22885c21 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -950,16 +950,19 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 		return ret;
 
 	ret = btrfs_run_dev_stats(trans, root->fs_info);
-	WARN_ON(ret);
+	if (ret)
+		return ret;
 	ret = btrfs_run_dev_replace(trans, root->fs_info);
-	WARN_ON(ret);
-
+	if (ret)
+		return ret;
 	ret = btrfs_run_qgroups(trans, root->fs_info);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	/* run_qgroups might have added some more refs */
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
-- 
cgit v1.3


From 724e2315db3d59a8201d4a87c7c7a873e60e1ce0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 30 Sep 2013 11:36:38 -0400
Subject: Btrfs: fix two use-after-free bugs with transaction cleanup

I was noticing the slab redzone stuff going off every once and a while during
transaction aborts.  This was caused by two things

1) We would walk the pending snapshots and set their error to -ECANCELED.  We
don't need to do this, the snapshot stuff waits for a transaction commit and if
there is a problem we just free our pending snapshot object and exit.  Doing
this was causing us to touch the pending snapshot object after the thing had
already been freed.

2) We were freeing the transaction manually with wanton disregard for it's
use_count reference counter.  To fix this I cleaned up the transaction freeing
loop to either wait for the transaction commit to finish if it was in the middle
of that (since it will be cleaned and freed up there) or to do the cleanup
oursevles.

I also moved the global "kill all things dirty everywhere" stuff outside of the
transaction cleanup loop since that only needs to be done once.  With this patch
I'm no longer seeing slab corruption because of use after frees.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c     | 111 ++++++++++++++++++-------------------------------
 fs/btrfs/transaction.c |  22 +++++-----
 fs/btrfs/transaction.h |   1 +
 3 files changed, 52 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 01a26e2eb310..b131d716d5a5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -64,7 +64,6 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				      struct btrfs_root *root);
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages,
@@ -3915,24 +3914,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	return ret;
 }
 
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)
-{
-	struct btrfs_pending_snapshot *snapshot;
-	struct list_head splice;
-
-	INIT_LIST_HEAD(&splice);
-
-	list_splice_init(&t->pending_snapshots, &splice);
-
-	while (!list_empty(&splice)) {
-		snapshot = list_entry(splice.next,
-				      struct btrfs_pending_snapshot,
-				      list);
-		snapshot->error = -ECANCELED;
-		list_del_init(&snapshot->list);
-	}
-}
-
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 {
 	struct btrfs_inode *btrfs_inode;
@@ -4062,6 +4043,8 @@ again:
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_root *root)
 {
+	btrfs_destroy_ordered_operations(cur_trans, root);
+
 	btrfs_destroy_delayed_refs(cur_trans, root);
 	btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
 				cur_trans->dirty_pages.dirty_bytes);
@@ -4069,8 +4052,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&root->fs_info->transaction_blocked_wait);
 
-	btrfs_evict_pending_snapshots(cur_trans);
-
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	wake_up(&root->fs_info->transaction_wait);
 
@@ -4094,63 +4075,51 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 static int btrfs_cleanup_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *t;
-	LIST_HEAD(list);
 
 	mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 	spin_lock(&root->fs_info->trans_lock);
-	list_splice_init(&root->fs_info->trans_list, &list);
-	root->fs_info->running_transaction = NULL;
-	spin_unlock(&root->fs_info->trans_lock);
-
-	while (!list_empty(&list)) {
-		t = list_entry(list.next, struct btrfs_transaction, list);
-
-		btrfs_destroy_ordered_operations(t, root);
-
-		btrfs_destroy_all_ordered_extents(root->fs_info);
-
-		btrfs_destroy_delayed_refs(t, root);
-
-		/*
-		 *  FIXME: cleanup wait for commit
-		 *  We needn't acquire the lock here, because we are during
-		 *  the umount, there is no other task which will change it.
-		 */
-		t->state = TRANS_STATE_COMMIT_START;
-		smp_mb();
-		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
-			wake_up(&root->fs_info->transaction_blocked_wait);
-
-		btrfs_evict_pending_snapshots(t);
-
-		t->state = TRANS_STATE_UNBLOCKED;
-		smp_mb();
-		if (waitqueue_active(&root->fs_info->transaction_wait))
-			wake_up(&root->fs_info->transaction_wait);
-
-		btrfs_destroy_delayed_inodes(root);
-		btrfs_assert_delayed_root_empty(root);
-
-		btrfs_destroy_all_delalloc_inodes(root->fs_info);
-
-		btrfs_destroy_marked_extents(root, &t->dirty_pages,
-					     EXTENT_DIRTY);
-
-		btrfs_destroy_pinned_extent(root,
-					    root->fs_info->pinned_extents);
-
-		t->state = TRANS_STATE_COMPLETED;
-		smp_mb();
-		if (waitqueue_active(&t->commit_wait))
-			wake_up(&t->commit_wait);
+	while (!list_empty(&root->fs_info->trans_list)) {
+		t = list_first_entry(&root->fs_info->trans_list,
+				     struct btrfs_transaction, list);
+		if (t->state >= TRANS_STATE_COMMIT_START) {
+			atomic_inc(&t->use_count);
+			spin_unlock(&root->fs_info->trans_lock);
+			btrfs_wait_for_commit(root, t->transid);
+			btrfs_put_transaction(t);
+			spin_lock(&root->fs_info->trans_lock);
+			continue;
+		}
+		if (t == root->fs_info->running_transaction) {
+			t->state = TRANS_STATE_COMMIT_DOING;
+			spin_unlock(&root->fs_info->trans_lock);
+			/*
+			 * We wait for 0 num_writers since we don't hold a trans
+			 * handle open currently for this transaction.
+			 */
+			wait_event(t->writer_wait,
+				   atomic_read(&t->num_writers) == 0);
+		} else {
+			spin_unlock(&root->fs_info->trans_lock);
+		}
+		btrfs_cleanup_one_transaction(t, root);
 
-		atomic_set(&t->use_count, 0);
+		spin_lock(&root->fs_info->trans_lock);
+		if (t == root->fs_info->running_transaction)
+			root->fs_info->running_transaction = NULL;
 		list_del_init(&t->list);
-		memset(t, 0, sizeof(*t));
-		kmem_cache_free(btrfs_transaction_cachep, t);
-	}
+		spin_unlock(&root->fs_info->trans_lock);
 
+		btrfs_put_transaction(t);
+		trace_btrfs_transaction_commit(root);
+		spin_lock(&root->fs_info->trans_lock);
+	}
+	spin_unlock(&root->fs_info->trans_lock);
+	btrfs_destroy_all_ordered_extents(root->fs_info);
+	btrfs_destroy_delayed_inodes(root);
+	btrfs_assert_delayed_root_empty(root);
+	btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
+	btrfs_destroy_all_delalloc_inodes(root->fs_info);
 	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
 	return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f08e22885c21..134039fd59bb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -57,7 +57,7 @@ static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 					   __TRANS_JOIN_NOLOCK),
 };
 
-static void put_transaction(struct btrfs_transaction *transaction)
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(atomic_read(&transaction->use_count) == 0);
 	if (atomic_dec_and_test(&transaction->use_count)) {
@@ -332,7 +332,7 @@ static void wait_current_trans(struct btrfs_root *root)
 		wait_event(root->fs_info->transaction_wait,
 			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
 			   cur_trans->aborted);
-		put_transaction(cur_trans);
+		btrfs_put_transaction(cur_trans);
 	} else {
 		spin_unlock(&root->fs_info->trans_lock);
 	}
@@ -610,7 +610,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 	}
 
 	wait_for_commit(root, cur_trans);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 out:
 	return ret;
 }
@@ -735,7 +735,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	smp_mb();
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
@@ -1515,7 +1515,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
 
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 	return 0;
 }
 
@@ -1559,8 +1559,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 
 	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(root->fs_info->sb);
-	put_transaction(cur_trans);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 
 	trace_btrfs_transaction_commit(root);
 
@@ -1676,7 +1676,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		wait_for_commit(root, cur_trans);
 
-		put_transaction(cur_trans);
+		btrfs_put_transaction(cur_trans);
 
 		return ret;
 	}
@@ -1693,7 +1693,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 			wait_for_commit(root, prev_trans);
 
-			put_transaction(prev_trans);
+			btrfs_put_transaction(prev_trans);
 		} else {
 			spin_unlock(&root->fs_info->trans_lock);
 		}
@@ -1892,8 +1892,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	list_del_init(&cur_trans->list);
 	spin_unlock(&root->fs_info->trans_lock);
 
-	put_transaction(cur_trans);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 
 	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(root->fs_info->sb);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5c2af8491621..306f88ae1de3 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -166,4 +166,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_transaction_blocked(struct btrfs_fs_info *info);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
 #endif
-- 
cgit v1.3


From 0a4e558609dd4df30a58a07d9eb14c5ddc2c1241 Mon Sep 17 00:00:00 2001
From: Ross Kirk <ross.kirk@gmail.com>
Date: Tue, 24 Sep 2013 10:12:38 +0100
Subject: btrfs: remove unused parameter from btrfs_header_fsid

Remove unused parameter, 'eb'. Unused since introduction in
5f39d397dfbe140a14edecd4e73c34ce23c4f9ee

Updated to be rebased against current upstream and correct diff supplied this time!

Signed-off-by: Ross Kirk <ross.kirk@gmail.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c   | 10 +++++-----
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/disk-io.c |  6 +++---
 fs/btrfs/ioctl.c   |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6932686b61cf..6c5239b55495 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -274,7 +274,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, new_root_objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
@@ -996,7 +996,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, root->root_key.objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 
 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
@@ -3152,7 +3152,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(c, root->root_key.objectid);
 
-	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c),
+	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 
 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
@@ -3291,7 +3291,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(split, root->root_key.objectid);
 	write_extent_buffer(split, root->fs_info->fsid,
-			    btrfs_header_fsid(split), BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(split),
 			    BTRFS_UUID_SIZE);
@@ -4046,7 +4046,7 @@ again:
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
 	write_extent_buffer(right, root->fs_info->fsid,
-			    btrfs_header_fsid(right), BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(right),
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ee0b8aa82828..ee8972b9f1ab 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2667,7 +2667,7 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
 	btrfs_set_header_flags(eb, flags);
 }
 
-static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_fsid(void)
 {
 	return offsetof(struct btrfs_header, fsid);
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b131d716d5a5..ade6c0e79616 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -495,7 +495,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	u8 fsid[BTRFS_UUID_SIZE];
 	int ret = 1;
 
-	read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE);
+	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	while (fs_devices) {
 		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
 			ret = 0;
@@ -1311,7 +1311,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(leaf, objectid);
 	root->node = leaf;
 
-	write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf),
+	write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 	write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(leaf),
@@ -1398,7 +1398,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	root->node = leaf;
 
 	write_extent_buffer(root->node, root->fs_info->fsid,
-			    btrfs_header_fsid(root->node), BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
 	return root;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 385c58f2bba1..2c47e1a400a9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -436,7 +436,7 @@ static noinline int create_subvol(struct inode *dir,
 	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(leaf, objectid);
 
-	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf),
+	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(leaf),
-- 
cgit v1.3


From 20dd2cbf01888a91fdd921403040a710b275a1ff Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 21:47:45 +0800
Subject: Btrfs: fix BUG_ON() casued by the reserved space migration

When we did space balance and snapshot creation at the same time, we might
meet the following oops:
 kernel BUG at fs/btrfs/inode.c:3038!
 [SNIP]
 Call Trace:
 [<ffffffffa0411ec7>] btrfs_orphan_cleanup+0x293/0x407 [btrfs]
 [<ffffffffa042dc45>] btrfs_mksubvol.isra.28+0x259/0x373 [btrfs]
 [<ffffffffa042de85>] btrfs_ioctl_snap_create_transid+0x126/0x156 [btrfs]
 [<ffffffffa042dff1>] btrfs_ioctl_snap_create_v2+0xd0/0x121 [btrfs]
 [<ffffffffa0430b2c>] btrfs_ioctl+0x414/0x1854 [btrfs]
 [<ffffffff813b60b7>] ? __do_page_fault+0x305/0x379
 [<ffffffff811215a9>] vfs_ioctl+0x1d/0x39
 [<ffffffff81121d7c>] do_vfs_ioctl+0x32d/0x3e2
 [<ffffffff81057fe7>] ? finish_task_switch+0x80/0xb8
 [<ffffffff81121e88>] SyS_ioctl+0x57/0x83
 [<ffffffff813b39ff>] ? do_device_not_available+0x12/0x14
 [<ffffffff813b99c2>] system_call_fastpath+0x16/0x1b
 [SNIP]
 RIP  [<ffffffffa040da40>] btrfs_orphan_add+0xc3/0x126 [btrfs]

The reason of the problem is that the relocation root creation stole
the reserved space, which was reserved for orphan item deletion.

There are several ways to fix this problem, one is to increasing
the reserved space size of the space balace, and then we can use
that space to create the relocation tree for each fs/file trees.
But it is hard to calculate the suitable size because we doesn't
know how many fs/file trees we need relocate.

We fixed this problem by reserving the space for relocation root creation
actively since the space it need is very small (one tree block, used for
root node copy), then we use that reserved space to create the
relocation tree. If we don't reserve space for relocation tree creation,
we will use the reserved space of the balance.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c  |  6 ++++--
 fs/btrfs/transaction.c | 24 +++++++++++++++++++++++-
 fs/btrfs/transaction.h |  1 +
 3 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0359eecac95d..a945374993f7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1383,6 +1383,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *reloc_root;
 	struct reloc_control *rc = root->fs_info->reloc_ctl;
+	struct btrfs_block_rsv *rsv;
 	int clear_rsv = 0;
 	int ret;
 
@@ -1396,13 +1397,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
 		return 0;
 
-	if (!trans->block_rsv) {
+	if (!trans->reloc_reserved) {
+		rsv = trans->block_rsv;
 		trans->block_rsv = rc->block_rsv;
 		clear_rsv = 1;
 	}
 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
 	if (clear_rsv)
-		trans->block_rsv = NULL;
+		trans->block_rsv = rsv;
 
 	ret = __add_reloc_root(reloc_root);
 	BUG_ON(ret < 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 134039fd59bb..a213bafe68ec 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -353,6 +353,17 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 	return 0;
 }
 
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+	if (!root->fs_info->reloc_ctl ||
+	    !root->ref_cows ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+	    root->reloc_root)
+		return false;
+
+	return true;
+}
+
 static struct btrfs_trans_handle *
 start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 		  enum btrfs_reserve_flush_enum flush)
@@ -360,8 +371,9 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
 	u64 num_bytes = 0;
-	int ret;
 	u64 qgroup_reserved = 0;
+	bool reloc_reserved = false;
+	int ret;
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
@@ -390,6 +402,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 		}
 
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+		/*
+		 * Do the reservation for the relocation root creation
+		 */
+		if (unlikely(need_reserve_reloc_root(root))) {
+			num_bytes += root->nodesize;
+			reloc_reserved = true;
+		}
+
 		ret = btrfs_block_rsv_add(root,
 					  &root->fs_info->trans_block_rsv,
 					  num_bytes, flush);
@@ -451,6 +471,7 @@ again:
 	h->delayed_ref_elem.seq = 0;
 	h->type = type;
 	h->allocating_chunk = false;
+	h->reloc_reserved = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
 
@@ -466,6 +487,7 @@ again:
 					      h->transid, num_bytes, 1);
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
+		h->reloc_reserved = reloc_reserved;
 	}
 	h->qgroup_reserved = qgroup_reserved;
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 306f88ae1de3..7657d115067d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -92,6 +92,7 @@ struct btrfs_trans_handle {
 	short aborted;
 	short adding_csums;
 	bool allocating_chunk;
+	bool reloc_reserved;
 	unsigned int type;
 	/*
 	 * this root is only needed to validate that the root passed to
-- 
cgit v1.3


From fa7c14947abea7b30f661d902c9a515056be6d90 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 26 Sep 2013 13:15:27 +0800
Subject: Btrfs: improve jitter performance of the sequential buffered write

The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.

It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.

So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.

According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.

Test Environment:
 CPU:		1CPU * 4Cores
 Memory:	6GB
 Partition:	20GB

Test Command:
 # sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
 > --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0dcb4c3d72d..1ca49eaba3bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4239,15 +4239,16 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
-		btrfs_wait_ordered_range(inode, hole_start,
-					 block_end - hole_start);
+
 		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
 				 &cached_state);
-		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+		ordered = btrfs_lookup_ordered_range(inode, hole_start,
+						     block_end - hole_start);
 		if (!ordered)
 			break;
 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
 				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 	}
 
-- 
cgit v1.3


From 7d3d1744f8a7d62e4875bd69cc2192a939813880 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Sun, 29 Sep 2013 10:33:16 +0800
Subject: Btrfs: fix memory leak of chunks' extent map

As we're hold a ref on looking up the extent map, we need to drop the ref
before returning to callers.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 043b215769c2..b691f375d837 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4488,6 +4488,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
 			    "%Lu-%Lu\n", logical, logical+len, em->start,
 			    em->start + em->len);
+		free_extent_map(em);
 		return 1;
 	}
 
@@ -4668,6 +4669,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
 			   "found %Lu-%Lu\n", logical, em->start,
 			   em->start + em->len);
+		free_extent_map(em);
 		return -EINVAL;
 	}
 
-- 
cgit v1.3


From 6174d3cb43aa974d0c8590a3e628ac35ab0bbc13 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 1 Oct 2013 16:13:42 +0100
Subject: Btrfs: remove unused max_key arg from btrfs_search_forward

It is not used for anything.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c       |  1 -
 fs/btrfs/ctree.h       |  1 -
 fs/btrfs/ioctl.c       | 16 ++--------------
 fs/btrfs/tree-defrag.c |  2 +-
 fs/btrfs/tree-log.c    |  9 ++-------
 fs/btrfs/uuid-tree.c   |  6 +-----
 fs/btrfs/volumes.c     |  2 +-
 7 files changed, 7 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6c5239b55495..33e9dbdd043d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4870,7 +4870,6 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * was nothing in the tree that matched the search criteria.
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
-			 struct btrfs_key *max_key,
 			 struct btrfs_path *path,
 			 u64 min_trans)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ee8972b9f1ab..9ca15a84cc5a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3309,7 +3309,6 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
 			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
-			 struct btrfs_key *max_key,
 			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2c47e1a400a9..98d4ffef961e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -842,7 +842,6 @@ static int find_new_extents(struct btrfs_root *root,
 {
 	struct btrfs_path *path;
 	struct btrfs_key min_key;
-	struct btrfs_key max_key;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *extent;
 	int type;
@@ -857,15 +856,10 @@ static int find_new_extents(struct btrfs_root *root,
 	min_key.type = BTRFS_EXTENT_DATA_KEY;
 	min_key.offset = *off;
 
-	max_key.objectid = ino;
-	max_key.type = (u8)-1;
-	max_key.offset = (u64)-1;
-
 	path->keep_locks = 1;
 
 	while(1) {
-		ret = btrfs_search_forward(root, &min_key, &max_key,
-					   path, newer_than);
+		ret = btrfs_search_forward(root, &min_key, path, newer_than);
 		if (ret != 0)
 			goto none;
 		if (min_key.objectid != ino)
@@ -1893,7 +1887,6 @@ static noinline int search_ioctl(struct inode *inode,
 {
 	struct btrfs_root *root;
 	struct btrfs_key key;
-	struct btrfs_key max_key;
 	struct btrfs_path *path;
 	struct btrfs_ioctl_search_key *sk = &args->key;
 	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
@@ -1925,15 +1918,10 @@ static noinline int search_ioctl(struct inode *inode,
 	key.type = sk->min_type;
 	key.offset = sk->min_offset;
 
-	max_key.objectid = sk->max_objectid;
-	max_key.type = sk->max_type;
-	max_key.offset = sk->max_offset;
-
 	path->keep_locks = 1;
 
 	while(1) {
-		ret = btrfs_search_forward(root, &key, &max_key, path,
-					   sk->min_transid);
+		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
 		if (ret != 0) {
 			if (ret > 0)
 				ret = 0;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 94e05c1f118a..cbaa73c871a9 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -85,7 +85,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
 	path->keep_locks = 1;
 
-	ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
+	ret = btrfs_search_forward(root, &key, path, min_trans);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 79f057c0619a..8ac96c261a11 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2867,7 +2867,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			  u64 min_offset, u64 *last_offset_ret)
 {
 	struct btrfs_key min_key;
-	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src;
 	int err = 0;
@@ -2879,9 +2878,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	u64 ino = btrfs_ino(inode);
 
 	log = root->log_root;
-	max_key.objectid = ino;
-	max_key.offset = (u64)-1;
-	max_key.type = key_type;
 
 	min_key.objectid = ino;
 	min_key.type = key_type;
@@ -2889,8 +2885,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
 	path->keep_locks = 1;
 
-	ret = btrfs_search_forward(root, &min_key, &max_key,
-				   path, trans->transid);
+	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
 
 	/*
 	 * we didn't find anything from this transaction, see if there
@@ -3719,7 +3714,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ins_nr = 0;
-		ret = btrfs_search_forward(root, &min_key, &max_key,
+		ret = btrfs_search_forward(root, &min_key,
 					   path, trans->transid);
 		if (ret != 0)
 			break;
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index dd0dea3766f7..fbda90004fe9 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -260,7 +260,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_root *root = fs_info->uuid_root;
 	struct btrfs_key key;
-	struct btrfs_key max_key;
 	struct btrfs_path *path;
 	int ret = 0;
 	struct extent_buffer *leaf;
@@ -277,13 +276,10 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 	key.objectid = 0;
 	key.type = 0;
 	key.offset = 0;
-	max_key.objectid = (u64)-1;
-	max_key.type = (u8)-1;
-	max_key.offset = (u64)-1;
 
 again_search_slot:
 	path->keep_locks = 1;
-	ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+	ret = btrfs_search_forward(root, &key, path, 0);
 	if (ret) {
 		if (ret > 0)
 			ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b691f375d837..c846cc8eb32c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3488,7 +3488,7 @@ static int btrfs_uuid_scan_kthread(void *data)
 	path->keep_locks = 1;
 
 	while (1) {
-		ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+		ret = btrfs_search_forward(root, &key, path, 0);
 		if (ret) {
 			if (ret > 0)
 				ret = 0;
-- 
cgit v1.3


From 3d41d70252234db153ea1b037052278ff5786ad5 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 1 Oct 2013 17:06:53 +0100
Subject: Btrfs: remove unnecessary tree search when logging inode

In tree-log.c:btrfs_log_inode(), we keep calling btrfs_search_forward()
until it returns a key whose objectid is higher than our inode or until
the key's type is higher than our maximum allowed type.

At the end of the loop, we increment our mininum search key's objectid
and type regardless of our desired target objectid and maximum desired
type, which causes another loop iteration that will call again
btrfs_search_forward() just to figure out we've gone beyond our maximum
key and exit the loop. Therefore while incrementing our minimum key,
don't do it blindly and exit the loop immiediately if the next search
key's objectid or type is beyond what we seek.

Also after incrementing the type, set the key's offset to 0, which was
missing and could make us loose some of the inode's items.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ac96c261a11..964c583ea900 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3764,14 +3764,14 @@ next_slot:
 		}
 		btrfs_release_path(path);
 
-		if (min_key.offset < (u64)-1)
+		if (min_key.offset < (u64)-1) {
 			min_key.offset++;
-		else if (min_key.type < (u8)-1)
+		} else if (min_key.type < max_key.type) {
 			min_key.type++;
-		else if (min_key.objectid < (u64)-1)
-			min_key.objectid++;
-		else
+			min_key.offset = 0;
+		} else {
 			break;
+		}
 	}
 	if (ins_nr) {
 		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
-- 
cgit v1.3


From 778ba82b1796e75e719a52679ae431371ca73988 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Sun, 6 Oct 2013 22:22:33 +0100
Subject: Btrfs: improve inode hash function/inode lookup

Currently the hash value used for adding an inode to the VFS's inode
hash table consists of the plain inode number, which is a 64 bits
integer. This results in hash table buckets (hlist_head lists) with
too many elements for at least 2 important scenarios:

1) When we have many subvolumes. Each subvolume has its own btree
   where its files and directories are added to, and each has its
   own objectid (inode number) namespace. This means that if we have
   N subvolumes, and all have inode number X associated to a file or
   directory, the corresponding inodes all map to the same hash table
   entry, resulting in a bucket (hlist_head list) with N elements;

2) On 32 bits machines. Th VFS hash values are unsigned longs, which
   are 32 bits wide on 32 bits machines, and the inode (objectid)
   numbers are 64 bits unsigned integers. We simply cast the inode
   numbers to hash values, which means that for all inodes with the
   same 32 bits lower half, the same hash bucket is used for all of
   them. For example, all inodes with a number (objectid) between
   0x0000_0000_ffff_ffff and 0xffff_ffff_ffff_ffff will end up in
   the same hash table bucket.

This change ensures the inode's hash value depends both on the
objectid (inode number) and its subvolume's (btree root) objectid.
For 32 bits machines, this change gives better entropy by making
the hash value depend on both the upper and lower 32 bits of the
64 bits hash previously computed.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/btrfs_inode.h | 20 ++++++++++++++++++++
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/inode.c       |  6 ++++--
 3 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 71f074e1870b..ac0b39db27d1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,6 +19,7 @@
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
+#include <linux/hash.h>
 #include "extent_map.h"
 #include "extent_io.h"
 #include "ordered-data.h"
@@ -179,6 +180,25 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 	return container_of(inode, struct btrfs_inode, vfs_inode);
 }
 
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+					     const struct btrfs_root *root)
+{
+	u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+	h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+	return (unsigned long)h;
+}
+
+static inline void btrfs_insert_inode_hash(struct inode *inode)
+{
+	unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
+
+	__insert_inode_hash(inode, h);
+}
+
 static inline u64 btrfs_ino(struct inode *inode)
 {
 	u64 ino = BTRFS_I(inode)->location.objectid;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ade6c0e79616..d205bddc7776 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2294,7 +2294,7 @@ int open_ctree(struct super_block *sb,
 	       sizeof(struct btrfs_key));
 	set_bit(BTRFS_INODE_DUMMY,
 		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
-	insert_inode_hash(fs_info->btree_inode);
+	btrfs_insert_inode_hash(fs_info->btree_inode);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ca49eaba3bb..bb242f2fb51e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4837,10 +4837,12 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
+	unsigned long hashval = btrfs_inode_hash(objectid, root);
+
 	args.ino = objectid;
 	args.root = root;
 
-	inode = iget5_locked(s, objectid, btrfs_find_actor,
+	inode = iget5_locked(s, hashval, btrfs_find_actor,
 			     btrfs_init_locked_inode,
 			     (void *)&args);
 	return inode;
@@ -5460,7 +5462,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				BTRFS_INODE_NODATASUM;
 	}
 
-	insert_inode_hash(inode);
+	btrfs_insert_inode_hash(inode);
 	inode_tree_add(inode);
 
 	trace_btrfs_inode_new(inode);
-- 
cgit v1.3


From 539f358a30d5113bad81c41a2e7ba8770d6c9f6e Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 7 Oct 2013 13:42:57 +0300
Subject: Btrfs: fix the dev-replace suspend sequence

Replace progresses strictly from lower to higher offsets, and the
progress is tracked in chunks, by storing the physical offset of the
dev_extent which is being copied in the cursor_left field of
btrfs_dev_replace_item.  When we are done copying the chunk,
left_cursor is updated to point one byte past the dev_extent, so that
on resume we can skip the dev_extents that have already been copied.

There is a major bug (which goes all the way back to the inception of
dev-replace in 3.8) in the way left_cursor is bumped: the bump is done
unconditionally, without any regard to the scrub_chunk return value.
On suspend (and also on any kind of error) scrub_chunk returns early,
i.e. without completing the copy.  This leads to us skipping the chunk
that hasn't been fully copied yet when resuming.

Fix this by doing the cursor_left update only if scrub_chunk ret is 0.
(On suspend scrub_chunk returns with -ECANCELED, so this fix covers
both suspend and error cases.)

Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/scrub.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a18e0e23f6a6..f21e2df89bc2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2717,8 +2717,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		mutex_unlock(&fs_info->scrub_lock);
 		wake_up(&fs_info->scrub_pause_wait);
 
-		dev_replace->cursor_left = dev_replace->cursor_right;
-		dev_replace->item_needs_writeback = 1;
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
@@ -2732,6 +2730,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			break;
 		}
 
+		dev_replace->cursor_left = dev_replace->cursor_right;
+		dev_replace->item_needs_writeback = 1;
+
 		key.offset = found_key.offset + length;
 		btrfs_release_path(path);
 	}
-- 
cgit v1.3


From 80d94fb3df2bf24a500f13181921736fa23b6c3d Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 7 Oct 2013 12:04:28 +0100
Subject: Btrfs: fix memory leaks on transaction commit failure

Structures of the types tree_mod_elem and qgroup_update are allocated
during transaction commit but were not being released if the call to
btrfs_run_delayed_items() returned an error.

Stack trace reported by kmemleak:

unreferenced object 0xffff880679f0b398 (size 128):
  comm "umount", pid 21508, jiffies 4295967793 (age 36718.112s)
  hex dump (first 32 bytes):
    60 b5 f0 79 06 88 ff ff 00 00 00 00 00 00 00 00  `..y............
    00 00 00 00 00 00 00 00 50 1c 00 00 00 00 00 00  ........P.......
  backtrace:
    [<ffffffff81742d26>] kmemleak_alloc+0x26/0x50
    [<ffffffff811889c2>] kmem_cache_alloc_trace+0x112/0x200
    [<ffffffffa046f2d3>] tree_mod_log_insert_key.constprop.45+0x93/0x150 [btrfs]
    [<ffffffffa04720f9>] __btrfs_cow_block+0x299/0x4f0 [btrfs]
    [<ffffffffa0472510>] btrfs_cow_block+0x120/0x1f0 [btrfs]
    [<ffffffffa0476679>] btrfs_search_slot+0x449/0x930 [btrfs]
    [<ffffffffa048eecf>] btrfs_lookup_inode+0x2f/0xa0 [btrfs]
    [<ffffffffa04eb49c>] __btrfs_update_delayed_inode+0x1c/0x1d0 [btrfs]
    [<ffffffffa04eb9e2>] __btrfs_run_delayed_items+0x162/0x1e0 [btrfs]
    [<ffffffffa04eba63>] btrfs_delayed_inode_exit+0x3/0x20 [btrfs]
    [<ffffffffa0499c63>] btrfs_commit_transaction+0x203/0xa50 [btrfs]
    [<ffffffffa046b519>] btrfs_sync_fs+0x69/0x110 [btrfs]
    [<ffffffff811cb210>] __sync_filesystem+0x30/0x60
    [<ffffffff811cb2bb>] sync_filesystem+0x4b/0x70
    [<ffffffff8119ce7b>] generic_shutdown_super+0x3b/0xf0
    [<ffffffff8119cfc6>] kill_anon_super+0x16/0x30
unreferenced object 0xffff880677e0dd88 (size 32):
  comm "umount", pid 21508, jiffies 4295967793 (age 36718.112s)
  hex dump (first 32 bytes):
    78 75 11 a9 06 88 ff ff 00 c0 e0 77 06 88 ff ff  xu.........w....
    40 c3 a2 70 06 88 ff ff 00 00 00 00 00 00 00 00  @..p............
  backtrace:
    [<ffffffff81742d26>] kmemleak_alloc+0x26/0x50
    [<ffffffff811889c2>] kmem_cache_alloc_trace+0x112/0x200
    [<ffffffffa04fa54f>] btrfs_qgroup_record_ref+0xf/0x90 [btrfs]
    [<ffffffffa04e1914>] btrfs_add_delayed_tree_ref+0xf4/0x170 [btrfs]
    [<ffffffffa048518a>] btrfs_free_tree_block+0x9a/0x220 [btrfs]
    [<ffffffffa0472163>] __btrfs_cow_block+0x303/0x4f0 [btrfs]
    [<ffffffffa0472510>] btrfs_cow_block+0x120/0x1f0 [btrfs]
    [<ffffffffa0476679>] btrfs_search_slot+0x449/0x930 [btrfs]
    [<ffffffffa048eecf>] btrfs_lookup_inode+0x2f/0xa0 [btrfs]
    [<ffffffffa04eb49c>] __btrfs_update_delayed_inode+0x1c/0x1d0 [btrfs]
    [<ffffffffa04eb9e2>] __btrfs_run_delayed_items+0x162/0x1e0 [btrfs]
    [<ffffffffa04eba63>] btrfs_delayed_inode_exit+0x3/0x20 [btrfs]
    [<ffffffffa0499c63>] btrfs_commit_transaction+0x203/0xa50 [btrfs]
    [<ffffffffa046b519>] btrfs_sync_fs+0x69/0x110 [btrfs]
    [<ffffffff811cb210>] __sync_filesystem+0x30/0x60
    [<ffffffff811cb2bb>] sync_filesystem+0x4b/0x70

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/transaction.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a213bafe68ec..277fe812d047 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1600,15 +1600,19 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 	int ret;
 
 	ret = btrfs_run_delayed_items(trans, root);
-	if (ret)
-		return ret;
-
 	/*
 	 * running the delayed items may have added new refs. account
 	 * them now so that they hinder processing of more delayed refs
 	 * as little as possible.
 	 */
-	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+	if (ret) {
+		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+		return ret;
+	}
+
+	ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+	if (ret)
+		return ret;
 
 	/*
 	 * rename don't use btrfs_join_transaction, so, once we
-- 
cgit v1.3


From eb58bb371a04d3bbab44ec0c5672ce69487bac1e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 10:45:07 -0400
Subject: Btrfs: do not free the dirty bytes from the trans block rsv on
 cleanup

The transactions should be cleaning up their reservations on failure, this just
causes us to have warnings on unmount because we go negative by free'ing
reservations that have already been free'ed.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d205bddc7776..fdc75ab09483 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4046,8 +4046,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	btrfs_destroy_ordered_operations(cur_trans, root);
 
 	btrfs_destroy_delayed_refs(cur_trans, root);
-	btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
-				cur_trans->dirty_pages.dirty_bytes);
 
 	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&root->fs_info->transaction_blocked_wait);
-- 
cgit v1.3


From 681ae50917df9fd1fae2571fb36a579d1e872b12 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:11:00 -0400
Subject: Btrfs: cleanup reserved space when freeing tree log on error

On error we will wait and free the tree log at unmount without a transaction.
This means that the actual freeing of the blocks doesn't happen which means we
complain about space leaks on unmount.  So to fix this just skip the transaction
specific cleanup part of the tree log free'ing if we don't have a transaction
and that way we can free up our reserved space and our counters stay happy.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 964c583ea900..849b72985c74 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2151,11 +2151,13 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 					return ret;
 				}
 
-				btrfs_tree_lock(next);
-				btrfs_set_lock_blocking(next);
-				clean_tree_block(trans, root, next);
-				btrfs_wait_tree_block_writeback(next);
-				btrfs_tree_unlock(next);
+				if (trans) {
+					btrfs_tree_lock(next);
+					btrfs_set_lock_blocking(next);
+					clean_tree_block(trans, root, next);
+					btrfs_wait_tree_block_writeback(next);
+					btrfs_tree_unlock(next);
+				}
 
 				WARN_ON(root_owner !=
 					BTRFS_TREE_LOG_OBJECTID);
@@ -2227,11 +2229,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
 				next = path->nodes[*level];
 
-				btrfs_tree_lock(next);
-				btrfs_set_lock_blocking(next);
-				clean_tree_block(trans, root, next);
-				btrfs_wait_tree_block_writeback(next);
-				btrfs_tree_unlock(next);
+				if (trans) {
+					btrfs_tree_lock(next);
+					btrfs_set_lock_blocking(next);
+					clean_tree_block(trans, root, next);
+					btrfs_wait_tree_block_writeback(next);
+					btrfs_tree_unlock(next);
+				}
 
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_and_pin_reserved_extent(root,
@@ -2301,11 +2305,13 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
 			next = path->nodes[orig_level];
 
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, log, next);
-			btrfs_wait_tree_block_writeback(next);
-			btrfs_tree_unlock(next);
+			if (trans) {
+				btrfs_tree_lock(next);
+				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, log, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+			}
 
 			WARN_ON(log->root_key.objectid !=
 				BTRFS_TREE_LOG_OBJECTID);
@@ -2608,13 +2614,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 		.process_func = process_one_buffer
 	};
 
-	if (trans) {
-		ret = walk_log_tree(trans, log, &wc);
-
-		/* I don't think this can happen but just in case */
-		if (ret)
-			btrfs_abort_transaction(trans, log, ret);
-	}
+	ret = walk_log_tree(trans, log, &wc);
+	/* I don't think this can happen but just in case */
+	if (ret)
+		btrfs_abort_transaction(trans, log, ret);
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
-- 
cgit v1.3


From 2b1360da35877cd969ed83884d8989ba778254d0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:14:44 -0400
Subject: Btrfs: free up block groups after everything

If we abort a transaction we will do the tree log cleanup at unmount, but this
happens after we free up the block groups.  This makes all the leak detection
warnings go off because we think we've leaked space but in reality we just
haven't cleaned it up yet.  So instead do the block group cleanup stuff after
free'ing the fs roots so we don't get these warnings.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fdc75ab09483..419968e3eaa0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3637,12 +3637,12 @@ int close_ctree(struct btrfs_root *root)
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
-	btrfs_free_block_groups(fs_info);
-
 	btrfs_stop_all_workers(fs_info);
 
 	del_fs_roots(fs_info);
 
+	btrfs_free_block_groups(fs_info);
+
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);
-- 
cgit v1.3


From 0be5dc67c445230b0889dba63defba9a9e5561b4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:18:52 -0400
Subject: Btrfs: fixup reserved trace points

In trying to track down where we were leaking reserved space I noticed our
reserve extent tracepoints are a little off.  First we were saying that the
reserved space had been alloced in btrfs_reserve_extent, which isn't the case,
this needs to be triggered when we actually allocate the space when we run the
delayed ref.  We were also missing a few places where we should have been
tracing the btrfs_reserve_extent_free tracepoint.  With these in place I was
able to put together where we were leaking reserved space.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 54ee542c705e..65401d7ef663 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5262,6 +5262,8 @@ static int pin_down_extent(struct btrfs_root *root,
 
 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	if (reserved)
+		trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
 	return 0;
 }
 
@@ -5965,6 +5967,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
 		pin = 0;
 	}
 out:
@@ -6592,8 +6595,6 @@ again:
 		}
 	}
 
-	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
-
 	return ret;
 }
 
@@ -6705,6 +6706,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 			ins->objectid, ins->offset);
 		BUG();
 	}
+	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
 	return ret;
 }
 
@@ -6777,6 +6779,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 			ins->objectid, ins->offset);
 		BUG();
 	}
+
+	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
 	return ret;
 }
 
-- 
cgit v1.3


From 857cc2fc29cfaf4ee98fe9967bbf6a3942191136 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:21:08 -0400
Subject: Btrfs: free reserved space on error in a few places

While trying to track down a reserved space leak I noticed a few places where we
won't properly clean up reserved space if we have an error, this patch fixes
those up.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 21 +++++++++++++++++++--
 fs/btrfs/inode.c       |  2 ++
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 65401d7ef663..054b11dc8edf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2234,8 +2234,12 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 
-	if (trans->aborted)
+	if (trans->aborted) {
+		if (insert_reserved)
+			btrfs_pin_extent(root, node->bytenr,
+					 node->num_bytes, 1);
 		return 0;
+	}
 
 	if (btrfs_delayed_ref_is_head(node)) {
 		struct btrfs_delayed_ref_head *head;
@@ -2411,6 +2415,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 				btrfs_free_delayed_extent_op(extent_op);
 
 				if (ret) {
+					/*
+					 * Need to reset must_insert_reserved if
+					 * there was an error so the abort stuff
+					 * can cleanup the reserved space
+					 * properly.
+					 */
+					if (must_insert_reserved)
+						locked_ref->must_insert_reserved = 1;
 					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
 					spin_lock(&delayed_refs->lock);
 					btrfs_delayed_ref_unlock(locked_ref);
@@ -6731,13 +6743,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 		size += sizeof(*block_info);
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+						   root->leafsize);
 		return -ENOMEM;
+	}
 
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 				      ins, size);
 	if (ret) {
+		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+						   root->leafsize);
 		btrfs_free_path(path);
 		return ret;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb242f2fb51e..bba7f1a5e9f3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8498,6 +8498,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 						  ins.offset, 0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		if (ret) {
+			btrfs_free_reserved_extent(root, ins.objectid,
+						   ins.offset);
 			btrfs_abort_transaction(trans, root, ret);
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
-- 
cgit v1.3


From 294e30fee35d3151d100cfe59e839c2dbc16a374 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 9 Oct 2013 12:00:56 -0400
Subject: Btrfs: add tests for find_lock_delalloc_range

So both Liu and I made huge messes of find_lock_delalloc_range trying to fix
stuff, me first by fixing extent size, then him by fixing something I broke and
then me again telling him to fix it a different way.  So this is obviously a
candidate for some testing.  This patch adds a pseudo fs so we can allocate fake
inodes for tests that need an inode or pages.  Then it addes a bunch of tests to
make sure find_lock_delalloc_range is acting the way it is supposed to.  With
this patch and all of our previous patches to find_lock_delalloc_range I am sure
it is working as expected now.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Makefile                |   2 +-
 fs/btrfs/ctree.h                 |   6 +
 fs/btrfs/extent_io.c             |   9 +-
 fs/btrfs/extent_io.h             |   6 +
 fs/btrfs/super.c                 |  14 +-
 fs/btrfs/tests/btrfs-tests.c     |  68 ++++++++++
 fs/btrfs/tests/btrfs-tests.h     |  15 +++
 fs/btrfs/tests/extent-io-tests.c | 276 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/magic.h       |   2 +-
 9 files changed, 389 insertions(+), 9 deletions(-)
 create mode 100644 fs/btrfs/tests/btrfs-tests.c
 create mode 100644 fs/btrfs/tests/extent-io-tests.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 4c7dfbfaa3b3..cac4f2d001a3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -15,4 +15,4 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
-	tests/extent-buffer-tests.o
+	tests/extent-buffer-tests.o tests/btrfs-tests.o tests/extent-io-tests.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ca15a84cc5a..2f398062b942 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -47,6 +47,12 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2bf6f46fae44..c10291cc4fd1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1598,11 +1598,10 @@ done:
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct inode *inode,
-					     struct extent_io_tree *tree,
-					     struct page *locked_page,
-					     u64 *start, u64 *end,
-					     u64 max_bytes)
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+				    struct extent_io_tree *tree,
+				    struct page *locked_page, u64 *start,
+				    u64 *end, u64 max_bytes)
 {
 	u64 delalloc_start;
 	u64 delalloc_end;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6dbc645f1f3d..f98602eac808 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -345,4 +345,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+				      struct extent_io_tree *tree,
+				      struct page *locked_page, u64 *start,
+				      u64 *end, u64 max_bytes);
+#endif
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0e398657d759..78041e34d15a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1791,10 +1791,20 @@ static int btrfs_run_sanity_tests(void)
 {
 	int ret;
 
-	ret = btrfs_test_free_space_cache();
+	ret = btrfs_init_test_fs();
 	if (ret)
 		return ret;
-	return btrfs_test_extent_buffer_operations();
+
+	ret = btrfs_test_free_space_cache();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_buffer_operations();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_io();
+out:
+	btrfs_destroy_test_fs();
+	return ret;
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 000000000000..697d527377c1
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+				       int flags, const char *dev_name,
+				       void *data)
+{
+	return mount_pseudo(fs_type, "btrfs_test:", NULL, NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+	.name		= "btrfs_test_fs",
+	.mount		= btrfs_test_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+	return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+	int ret;
+
+	ret = register_filesystem(&test_type);
+	if (ret) {
+		printk(KERN_ERR "btrfs: cannot register test file system\n");
+		return ret;
+	}
+
+	test_mnt = kern_mount(&test_type);
+	if (IS_ERR(test_mnt)) {
+		printk(KERN_ERR "btrfs: cannot mount test file system\n");
+		unregister_filesystem(&test_type);
+		return ret;
+	}
+	return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+	kern_unmount(test_mnt);
+	unregister_filesystem(&test_type);
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 04f2cd2ca568..e935fe5291e7 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -25,6 +25,10 @@
 
 int btrfs_test_free_space_cache(void);
 int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
 #else
 static inline int btrfs_test_free_space_cache(void)
 {
@@ -34,6 +38,17 @@ static inline int btrfs_test_extent_buffer_operations(void)
 {
 	return 0;
 }
+static inline int btrfs_init_test_fs(void)
+{
+	return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 000000000000..7e99c2f98dd0
--- /dev/null
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK		(1 << 0)
+#define PROCESS_RELEASE		(1 << 1)
+#define PROCESS_TEST_LOCKED	(1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+				       unsigned long flags)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int count = 0;
+	int loops = 0;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (flags & PROCESS_TEST_LOCKED &&
+			    !PageLocked(pages[i]))
+				count++;
+			if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+			if (flags & PROCESS_RELEASE)
+				page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+		loops++;
+		if (loops > 100000) {
+			printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+			break;
+		}
+	}
+	return count;
+}
+
+static int test_find_delalloc(void)
+{
+	struct inode *inode;
+	struct extent_io_tree tmp;
+	struct page *page;
+	struct page *locked_page = NULL;
+	unsigned long index = 0;
+	u64 total_dirty = 256 * 1024 * 1024;
+	u64 max_bytes = 128 * 1024 * 1024;
+	u64 start, end, test_start;
+	u64 found;
+	int ret = -EINVAL;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Failed to allocate test inode\n");
+		return -ENOMEM;
+	}
+
+	extent_io_tree_init(&tmp, &inode->i_data);
+
+	/*
+	 * First go through and create and mark all of our pages dirty, we pin
+	 * everything to make sure our pages don't get evicted and screw up our
+	 * test.
+	 */
+	for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			test_msg("Failed to allocate test page\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+		SetPageDirty(page);
+		if (index) {
+			unlock_page(page);
+		} else {
+			page_cache_get(page);
+			locked_page = page;
+		}
+	}
+
+	/* Test this scenario
+	 * |--- delalloc ---|
+	 * |---  search  ---|
+	 */
+	set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+	start = 0;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Should have found at least one delalloc\n");
+		goto out_bits;
+	}
+	if (start != 0 || end != 4095) {
+		test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+			 start, end);
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	unlock_page(locked_page);
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 *
+	 * |--- delalloc ---|
+	 *           |--- search ---|
+	 */
+	test_start = 64 * 1024 * 1024;
+	locked_page = find_lock_page(inode->i_mapping,
+				     test_start >> PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Couldn't find the locked page\n");
+		goto out_bits;
+	}
+	set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Couldn't find delalloc in our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != max_bytes - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+			 "%Lu\n", test_start, max_bytes - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("There were unlocked pages in the range\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	/* locked_page was unlocked above */
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 * |--- delalloc ---|
+	 *                    |--- search ---|
+	 */
+	test_start = max_bytes + 4096;
+	locked_page = find_lock_page(inode->i_mapping, test_start >>
+				     PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Could'nt find the locked page\n");
+		goto out_bits;
+	}
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (found) {
+		test_msg("Found range when we shouldn't have\n");
+		goto out_bits;
+	}
+	if (end != (u64)-1) {
+		test_msg("Did not return the proper end offset\n");
+		goto out_bits;
+	}
+
+	/*
+	 * Test this scenario
+	 * [------- delalloc -------|
+	 * [max_bytes]|-- search--|
+	 *
+	 * We are re-using our test_start from above since it works out well.
+	 */
+	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != total_dirty - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, total_dirty - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+
+	/*
+	 * Now to test where we run into a page that is no longer dirty in the
+	 * range we want to find.
+	 */
+	page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+			     >> PAGE_CACHE_SHIFT);
+	if (!page) {
+		test_msg("Couldn't find our page\n");
+		goto out_bits;
+	}
+	ClearPageDirty(page);
+	page_cache_release(page);
+
+	/* We unlocked it in the previous test */
+	lock_page(locked_page);
+	start = test_start;
+	end = 0;
+	/*
+	 * Currently if we fail to find dirty pages in the delalloc range we
+	 * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search.  If
+	 * this changes at any point in the future we will need to fix this
+	 * tests expected behavior.
+	 */
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+			 end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+			       PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	ret = 0;
+out_bits:
+	clear_extent_bits(&tmp, 0, total_dirty - 1,
+			  (unsigned long)-1, GFP_NOFS);
+out:
+	if (locked_page)
+		page_cache_release(locked_page);
+	process_page_range(inode, 0, total_dirty - 1,
+			   PROCESS_UNLOCK | PROCESS_RELEASE);
+	iput(inode);
+	return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+	test_msg("Running find delalloc tests\n");
+	return test_find_delalloc();
+}
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 2944278a8ba7..77c60311a6c6 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -71,6 +71,6 @@
 #define USBDEVICE_SUPER_MAGIC	0x9fa2
 #define MTD_INODE_FS_MAGIC      0x11307854
 #define ANON_INODE_FS_MAGIC	0x09041934
-
+#define BTRFS_TEST_MAGIC	0x73727279
 
 #endif /* __LINUX_MAGIC_H__ */
-- 
cgit v1.3


From aaedb55bc08f384b7f57dbb3222a511baed4decf Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 11 Oct 2013 14:44:09 -0400
Subject: Btrfs: add tests for btrfs_get_extent

I'm going to be removing hole extents in the near future so I wanted to make a
sanity test for btrfs_get_extent to make sure I don't break anything in the
meantime.  This patch just puts btrfs_get_extent through its paces by giving it
a completely unreasonable mapping to look at and make sure it is giving us back
maps that make sense.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Makefile            |   3 +-
 fs/btrfs/ctree.h             |   4 +
 fs/btrfs/inode.c             |   8 +
 fs/btrfs/super.c             |   3 +
 fs/btrfs/tests/btrfs-tests.c |   8 +-
 fs/btrfs/tests/btrfs-tests.h |   5 +
 fs/btrfs/tests/inode-tests.c | 832 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 861 insertions(+), 2 deletions(-)
 create mode 100644 fs/btrfs/tests/inode-tests.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index cac4f2d001a3..1a44e42d602a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -15,4 +15,5 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
-	tests/extent-buffer-tests.o tests/btrfs-tests.o tests/extent-io-tests.o
+	tests/extent-buffer-tests.o tests/btrfs-tests.o \
+	tests/extent-io-tests.o tests/inode-tests.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2f398062b942..9f5e1cfb0209 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4034,5 +4034,9 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
 	return signal_pending(current);
 }
 
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+#endif
 
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bba7f1a5e9f3..b5c2ad8fe2ca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7795,6 +7795,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	return inode;
 }
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
 static void btrfs_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 78041e34d15a..1f62c1ccab1f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1802,6 +1802,9 @@ static int btrfs_run_sanity_tests(void)
 	if (ret)
 		goto out;
 	ret = btrfs_test_extent_io();
+	if (ret)
+		goto out;
+	ret = btrfs_test_inodes();
 out:
 	btrfs_destroy_test_fs();
 	return ret;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 697d527377c1..757ef00a75a4 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -24,11 +24,17 @@
 
 static struct vfsmount *test_mnt = NULL;
 
+static const struct super_operations btrfs_test_super_ops = {
+	.alloc_inode	= btrfs_alloc_inode,
+	.destroy_inode	= btrfs_test_destroy_inode,
+};
+
 static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
 				       int flags, const char *dev_name,
 				       void *data)
 {
-	return mount_pseudo(fs_type, "btrfs_test:", NULL, NULL, BTRFS_TEST_MAGIC);
+	return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+			    NULL, BTRFS_TEST_MAGIC);
 }
 
 static struct file_system_type test_type = {
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index e935fe5291e7..b353bc806ca0 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -26,6 +26,7 @@
 int btrfs_test_free_space_cache(void);
 int btrfs_test_extent_buffer_operations(void);
 int btrfs_test_extent_io(void);
+int btrfs_test_inodes(void);
 int btrfs_init_test_fs(void);
 void btrfs_destroy_test_fs(void);
 struct inode *btrfs_new_test_inode(void);
@@ -49,6 +50,10 @@ static inline int btrfs_test_extent_io(void)
 {
 	return 0;
 }
+static inline int btrfs_test_inodes(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
new file mode 100644
index 000000000000..b0fc3bab2a96
--- /dev/null
+++ b/fs/btrfs/tests/inode-tests.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+
+static struct btrfs_fs_info *alloc_dummy_fs_info(void)
+{
+	struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+						GFP_NOFS);
+	if (!fs_info)
+		return fs_info;
+	fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+				      GFP_NOFS);
+	if (!fs_info->fs_devices) {
+		kfree(fs_info);
+		return NULL;
+	}
+	return fs_info;
+}
+static void free_dummy_root(struct btrfs_root *root)
+{
+	if (!root)
+		return;
+	if (root->fs_info) {
+		kfree(root->fs_info->fs_devices);
+		kfree(root->fs_info);
+	}
+	if (root->node)
+		free_extent_buffer(root->node);
+	kfree(root);
+}
+
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+			  u64 ram_bytes, u64 offset, u64 disk_bytenr,
+			  u64 disk_len, u32 type, u8 compression, int slot)
+{
+	struct btrfs_path path;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf = root->node;
+	struct btrfs_key key;
+	u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+	if (type == BTRFS_FILE_EXTENT_INLINE)
+		value_len += len;
+	memset(&path, 0, sizeof(path));
+
+	path.nodes[0] = leaf;
+	path.slots[0] = slot;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	setup_items_for_insert(root, &path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, fi, 1);
+	btrfs_set_file_extent_type(leaf, fi, type);
+	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+	btrfs_set_file_extent_offset(leaf, fi, offset);
+	btrfs_set_file_extent_num_bytes(leaf, fi, len);
+	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+	btrfs_set_file_extent_compression(leaf, fi, compression);
+	btrfs_set_file_extent_encryption(leaf, fi, 0);
+	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen.  We want
+ * this so we can test all of the corner cases of btrfs_get_extent.  Here is a
+ * diagram of how the extents will look though this may not be possible we still
+ * want to make sure everything acts normally (the last number is not inclusive)
+ *
+ * [0 - 5][5 -  6][6 - 10][10 - 4096][  4096 - 8192 ][8192 - 12288]
+ * [hole ][inline][ hole ][ regular ][regular1 split][    hole    ]
+ *
+ * [ 12288 - 20480][20480 - 24576][  24576 - 28672  ][28672 - 36864][36864 - 45056]
+ * [regular1 split][   prealloc1 ][prealloc1 written][   prealloc1 ][ compressed  ]
+ *
+ * [45056 - 49152][49152-53248][53248-61440][61440-65536][     65536+81920   ]
+ * [ compressed1 ][  regular  ][compressed1][  regular  ][ hole but no extent]
+ *
+ * [81920-86016]
+ * [  regular  ]
+ */
+static void setup_file_extents(struct btrfs_root *root)
+{
+	int slot = 0;
+	u64 disk_bytenr = 1 * 1024 * 1024;
+	u64 offset = 0;
+
+	/* First we want a hole */
+	insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+		      slot);
+	slot++;
+	offset += 5;
+
+	/*
+	 * Now we want an inline extent, I don't think this is possible but hey
+	 * why not?  Also keep in mind if we have an inline extent it counts as
+	 * the whole first page.  If we were to expand it we would have to cow
+	 * and we wouldn't have an inline extent anymore.
+	 */
+	insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+		      slot);
+	slot++;
+	offset = 4096;
+
+	/* Now another hole */
+	insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+		      slot);
+	slot++;
+	offset += 4;
+
+	/* Now for a regular extent */
+	insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	disk_bytenr += 4096;
+	offset += 4095;
+
+	/*
+	 * Now for 3 extents that were split from a hole punch so we test
+	 * offsets properly.
+	 */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
+		      0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 16384;
+
+	/* Now for a unwritten prealloc extent */
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 4096;
+
+	/*
+	 * We want to jack up disk_bytenr a little more so the em stuff doesn't
+	 * merge our records.
+	 */
+	disk_bytenr += 8192;
+
+	/*
+	 * Now for a partially written prealloc extent, basically the same as
+	 * the hole punch example above.  Ram_bytes never changes when you mark
+	 * extents written btw.
+	 */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 16384;
+
+	/* Now a normal compressed extent */
+	insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 8192;
+	/* No merges */
+	disk_bytenr += 8192;
+
+	/* Now a split compressed extent */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 8192;
+
+	/* Now extents that have a hole but no hole extent */
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 16384;
+	disk_bytenr += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	struct extent_map *em = NULL;
+	u64 orig_start;
+	u64 disk_bytenr;
+	u64 offset;
+	int ret = -ENOMEM;
+
+	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	BTRFS_I(inode)->location.offset = 0;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	/*
+	 * We do this since btrfs_get_extent wants to assign em->bdev to
+	 * root->fs_info->fs_devices->latest_bdev.
+	 */
+	root->fs_info = alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	root->node = alloc_dummy_extent_buffer(0, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		goto out;
+	}
+
+	/*
+	 * We will just free a dummy node if it's ref count is 2 so we need an
+	 * extra ref so our searches don't accidently release our page.
+	 */
+	extent_buffer_get(root->node);
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	ret = -EINVAL;
+
+	/* First with no extents */
+	BTRFS_I(inode)->root = root;
+	em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+	if (IS_ERR(em)) {
+		em = NULL;
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+		test_msg("Vacancy flag wasn't set properly\n");
+		goto out;
+	}
+	free_extent_map(em);
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+
+	/*
+	 * All of the magic numbers are based on the mapping setup in
+	 * setup_file_extents, so if you change anything there you need to
+	 * update the comment and update the expected values below.
+	 */
+	setup_file_extents(root);
+
+	em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 0 || em->len != 5) {
+		test_msg("Unexpected extent wanted start 0 len 5, got start "
+			 "%llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_INLINE) {
+		test_msg("Expected an inline, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4091) {
+		test_msg("Unexpected extent wanted start %llu len 1, got start "
+			 "%llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	/*
+	 * We don't test anything else for inline since it doesn't get set
+	 * unless we have a page for it to write into.  Maybe we should change
+	 * this?
+	 */
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4) {
+		test_msg("Unexpected extent wanted start %llu len 4, got start "
+			 "%llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Regular extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4095) {
+		test_msg("Unexpected extent wanted start %llu len 4095, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* The next 3 are split extents */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 orig_start, em->orig_start);
+		goto out;
+	}
+	disk_bytenr += (em->start - orig_start);
+	if (em->block_start != disk_bytenr) {
+		test_msg("Wrong block start, want %llu, have %llu\n",
+			 disk_bytenr, em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Prealloc extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* The next 3 are a half written prealloc extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_HOLE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+			 orig_start, em->orig_start);
+		goto out;
+	}
+	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+			 disk_bytenr + (em->start - em->orig_start),
+			 em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+			 em->orig_start);
+		goto out;
+	}
+	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+			 disk_bytenr + (em->start - em->orig_start),
+			 em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Now for the compressed extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, em->orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Split compressed extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, em->orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != disk_bytenr) {
+		test_msg("Block start does not match, want %llu got %llu\n",
+			 disk_bytenr, em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* A hole between regular extents but no hole extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	/*
+	 * Currently we just return a length that we requested rather than the
+	 * length of the actual hole, if this changes we'll have to change this
+	 * test.
+	 */
+	if (em->start != offset || em->len != 12288) {
+		test_msg("Unexpected extent wanted start %llu len 12288, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != vacancy_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 vacancy_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (!IS_ERR(em))
+		free_extent_map(em);
+	iput(inode);
+	free_dummy_root(root);
+	return ret;
+}
+
+int btrfs_test_inodes(void)
+{
+	test_msg("Running btrfs_get_extent tests\n");
+	return test_btrfs_get_extent();
+}
-- 
cgit v1.3


From 96192499c27e8b58d71f4370f29ca86d4ca915d7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 16 Oct 2013 13:53:28 -0400
Subject: Btrfs: stop all workers after we free block groups

Stefan was hitting a panic in the async worker stuff because we had outstanding
read bios while we were stopping the worker threads.  You could reproduce this
easily if you mount -o nospace_cache and ran generic/273.  This is because the
caching thread stuff is still going and we were stopping all the worker threads.
We need to stop the workers after this work is done, and the free block groups
code will wait for all the caching threads to stop first so we don't run into
this problem.  With this patch we no longer panic.  Thanks,

Cc: stable@vger.kernel.org
Reported-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 419968e3eaa0..b0ea9f4bbc70 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3637,12 +3637,12 @@ int close_ctree(struct btrfs_root *root)
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
-	btrfs_stop_all_workers(fs_info);
-
 	del_fs_roots(fs_info);
 
 	btrfs_free_block_groups(fs_info);
 
+	btrfs_stop_all_workers(fs_info);
+
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);
-- 
cgit v1.3


From 25a50341b6269b0622434d9360f0c771d219681a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 14 Oct 2013 12:08:38 -0400
Subject: Btrfs: handle a missing extent for the first file extent

While trying to kill our hole extents I noticed I was seeing problems where we
seek into a file and then start writing and then try to fiemap that file later.
This is because we search for offset 0, don't find anything and so back up one
slot, which puts us at the inode ref or something like that, which means we goto
not_found and create an extent map for our entire search area.  This isn't quite
what we want, we want to move forward one slot and see if there is an extent
there so we can limit our hole extent.  This patch fixes this problem, I will
add a testcase for this as well.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b5c2ad8fe2ca..9d39d08a1555 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5982,7 +5982,14 @@ again:
 	found_type = btrfs_key_type(&found_key);
 	if (found_key.objectid != objectid ||
 	    found_type != BTRFS_EXTENT_DATA_KEY) {
-		goto not_found;
+		/*
+		 * If we backup past the first extent we want to move forward
+		 * and see if there is an extent in front of us, otherwise we'll
+		 * say there is a hole for our whole search range which can
+		 * cause problems.
+		 */
+		extent_end = start;
+		goto next;
 	}
 
 	found_type = btrfs_file_extent_type(leaf, item);
@@ -5997,7 +6004,7 @@ again:
 		size = btrfs_file_extent_inline_len(leaf, item);
 		extent_end = ALIGN(extent_start + size, root->sectorsize);
 	}
-
+next:
 	if (start >= extent_end) {
 		path->slots[0]++;
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
-- 
cgit v1.3


From 0e30db86a4524f9e655e8cb9335b4da1787db3c1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 14 Oct 2013 13:15:02 -0400
Subject: Btrfs: add a sanity test for a vacant extent at the front of a file

Btrfs_get_extent was not handling this case properly, add a test to make sure we
don't regress.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tests/inode-tests.c | 133 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 128 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index b0fc3bab2a96..397d1f99a8eb 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -86,6 +86,26 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 }
 
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+	struct btrfs_path path;
+	struct extent_buffer *leaf = root->node;
+	struct btrfs_key key;
+	u32 value_len = 0;
+
+	memset(&path, 0, sizeof(path));
+
+	path.nodes[0] = leaf;
+	path.slots[0] = 0;
+
+	key.objectid = BTRFS_INODE_ITEM_KEY;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	setup_items_for_insert(root, &path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+}
+
 /*
  * Build the most complicated map of extents the earth has ever seen.  We want
  * this so we can test all of the corner cases of btrfs_get_extent.  Here is a
@@ -236,10 +256,6 @@ static noinline int test_btrfs_get_extent(void)
 	u64 offset;
 	int ret = -ENOMEM;
 
-	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
-	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
-	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
-
 	inode = btrfs_new_test_inode();
 	if (!inode) {
 		test_msg("Couldn't allocate inode\n");
@@ -825,8 +841,115 @@ out:
 	return ret;
 }
 
+static int test_hole_first(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	struct extent_map *em = NULL;
+	int ret = -ENOMEM;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	BTRFS_I(inode)->location.offset = 0;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	root->fs_info = alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	root->node = alloc_dummy_extent_buffer(0, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		goto out;
+	}
+
+	extent_buffer_get(root->node);
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	BTRFS_I(inode)->root = root;
+	ret = -EINVAL;
+
+	/*
+	 * Need a blank inode item here just so we don't confuse
+	 * btrfs_get_extent.
+	 */
+	insert_inode_item_key(root);
+	insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, 1);
+	em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 0 || em->len != 4096) {
+		test_msg("Unexpected extent wanted start 0 len 4096, got start "
+			 "%llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != vacancy_only) {
+		test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+			 em->flags);
+		goto out;
+	}
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != 4096) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 4096 || em->len != 4096) {
+		test_msg("Unexpected extent wanted start 4096 len 4096, got "
+			 "start %llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, wanted 0 got %lu\n",
+			 em->flags);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (!IS_ERR(em))
+		free_extent_map(em);
+	iput(inode);
+	free_dummy_root(root);
+	return ret;
+}
+
 int btrfs_test_inodes(void)
 {
+	int ret;
+
+	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
 	test_msg("Running btrfs_get_extent tests\n");
-	return test_btrfs_get_extent();
+	ret = test_btrfs_get_extent();
+	if (ret)
+		return ret;
+	test_msg("Running hole first btrfs_get_extent test\n");
+	return test_hole_first();
 }
-- 
cgit v1.3


From ed9e8af88e2551aaa6bf51d8063a2493e2d71597 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 14 Oct 2013 17:23:08 -0400
Subject: Btrfs: fix hole check in log_one_extent

I added an assert to make sure we were looking up aligned offsets for csums and
I tripped it when running xfstests.  This is because log_one_extent was checking
if block_start == 0 for a hole instead of EXTENT_MAP_HOLE.  This worked out fine
in practice it seems, but it adds a lot of extra work that is uneeded.  With
this fix I'm no longer tripping my assert.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 849b72985c74..1134aa40b6d1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3373,7 +3373,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_token_file_extent_type(leaf, fi,
 						 BTRFS_FILE_EXTENT_REG,
 						 &token);
-		if (em->block_start == 0)
+		if (em->block_start == EXTENT_MAP_HOLE)
 			skip_csum = true;
 	}
 
-- 
cgit v1.3


From 4277a9c3b3665f2830c55ece015163867b9414cc Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 15 Oct 2013 09:36:40 -0400
Subject: Btrfs: add an assert to btrfs_lookup_csums_range for alignment

I was hitting weird issues when trying to remove hole extents and it turned out
it was because I was sending non-aligned offsets down to
btrfs_lookup_csums_range.  So add an assert for this in case somebody trips over
this in the future.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/file-item.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4f53159bdb9d..ae8a5139e0ee 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -329,6 +329,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	u64 csum_end;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
+	ASSERT(start == ALIGN(start, root->sectorsize) &&
+	       (end + 1) == ALIGN(end + 1, root->sectorsize));
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-- 
cgit v1.3


From 7f4ca37c486733da008778a1f4058fbc194a4fdd Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 18 Oct 2013 11:44:46 -0400
Subject: Btrfs: fix up seek_hole/seek_data handling

Whoever wrote this was braindead.  Also it doesn't work right if you have
VACANCY's since we assumed you would only have that at the end of the file,
which won't be the case in the near future.  I tested this with generic/285 and
generic/286 as well as the btrfs tests that use fssum since it uses
seek_hole/seek_data to verify things are ok.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/file.c | 94 ++++++++++++---------------------------------------------
 1 file changed, 19 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 72da4df53c9a..bf3465cf455a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2405,14 +2405,12 @@ out_reserve_fail:
 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map *em;
+	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	u64 lockstart = *offset;
 	u64 lockend = i_size_read(inode);
 	u64 start = *offset;
-	u64 orig_start = *offset;
 	u64 len = i_size_read(inode);
-	u64 last_end = 0;
 	int ret = 0;
 
 	lockend = max_t(u64, root->sectorsize, lockend);
@@ -2429,89 +2427,35 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
 			 &cached_state);
 
-	/*
-	 * Delalloc is such a pain.  If we have a hole and we have pending
-	 * delalloc for a portion of the hole we will get back a hole that
-	 * exists for the entire range since it hasn't been actually written
-	 * yet.  So to take care of this case we need to look for an extent just
-	 * before the position we want in case there is outstanding delalloc
-	 * going on here.
-	 */
-	if (whence == SEEK_HOLE && start != 0) {
-		if (start <= root->sectorsize)
-			em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
-						     root->sectorsize, 0);
-		else
-			em = btrfs_get_extent_fiemap(inode, NULL, 0,
-						     start - root->sectorsize,
-						     root->sectorsize, 0);
-		if (IS_ERR(em)) {
-			ret = PTR_ERR(em);
-			goto out;
-		}
-		last_end = em->start + em->len;
-		if (em->block_start == EXTENT_MAP_DELALLOC)
-			last_end = min_t(u64, last_end, inode->i_size);
-		free_extent_map(em);
-	}
-
-	while (1) {
+	while (start < inode->i_size) {
 		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
+			em = NULL;
 			break;
 		}
 
-		if (em->block_start == EXTENT_MAP_HOLE) {
-			if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
-				if (last_end <= orig_start) {
-					free_extent_map(em);
-					ret = -ENXIO;
-					break;
-				}
-			}
-
-			if (whence == SEEK_HOLE) {
-				*offset = start;
-				free_extent_map(em);
-				break;
-			}
-		} else {
-			if (whence == SEEK_DATA) {
-				if (em->block_start == EXTENT_MAP_DELALLOC) {
-					if (start >= inode->i_size) {
-						free_extent_map(em);
-						ret = -ENXIO;
-						break;
-					}
-				}
-
-				if (!test_bit(EXTENT_FLAG_PREALLOC,
-					      &em->flags)) {
-					*offset = start;
-					free_extent_map(em);
-					break;
-				}
-			}
-		}
+		if (whence == SEEK_HOLE &&
+		    (em->block_start == EXTENT_MAP_HOLE ||
+		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+			break;
+		else if (whence == SEEK_DATA &&
+			   (em->block_start != EXTENT_MAP_HOLE &&
+			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+			break;
 
 		start = em->start + em->len;
-		last_end = em->start + em->len;
-
-		if (em->block_start == EXTENT_MAP_DELALLOC)
-			last_end = min_t(u64, last_end, inode->i_size);
-
-		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
-			free_extent_map(em);
-			ret = -ENXIO;
-			break;
-		}
 		free_extent_map(em);
+		em = NULL;
 		cond_resched();
 	}
-	if (!ret)
-		*offset = min(*offset, inode->i_size);
-out:
+	free_extent_map(em);
+	if (!ret) {
+		if (whence == SEEK_DATA && start >= inode->i_size)
+			ret = -ENXIO;
+		else
+			*offset = min_t(loff_t, start, inode->i_size);
+	}
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			     &cached_state, GFP_NOFS);
 	return ret;
-- 
cgit v1.3


From 452c75c3d2187089f6e846710e6ea7883bf30f8a Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Mon, 7 Oct 2013 10:45:25 -0500
Subject: Btrfs: Simplify the logic in alloc_extent_buffer() for existing
 extent buffer case

alloc_extent_buffer() uses radix_tree_lookup() when radix_tree_insert()
fails with EEXIST. That part of the code is very similar to the code in
find_extent_buffer(). This patch replaces radix_tree_lookup() and
surrounding code in alloc_extent_buffer() with find_extent_buffer().

Note that radix_tree_lookup() does not need to be protected by
tree->buffer_lock. It is protected by eb->refs.

While at it, this patch
  - changes the other usage of radix_tree_lookup() in alloc_extent_buffer()
    with find_extent_buffer() to reduce redundancy.
  - removes the unused argument 'len' to find_extent_buffer().

Signed-Off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Reviewed-by: Zach Brown <zab@redhat.com>

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c   |  3 +--
 fs/btrfs/extent_io.c | 63 ++++++++++++++++++++++------------------------------
 fs/btrfs/extent_io.h |  2 +-
 3 files changed, 28 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ea9f4bbc70..ebc784aba3f5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1104,8 +1104,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
-	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-				bytenr, blocksize);
+	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr);
 	return eb;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c10291cc4fd1..5439f24ab06b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4486,6 +4486,23 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
 	}
 }
 
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+					 		u64 start)
+{
+	struct extent_buffer *eb;
+
+	rcu_read_lock();
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+	if (eb && atomic_inc_not_zero(&eb->refs)) {
+		rcu_read_unlock();
+		mark_extent_buffer_accessed(eb);
+		return eb;
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 					  u64 start, unsigned long len)
 {
@@ -4499,14 +4516,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	int uptodate = 1;
 	int ret;
 
-	rcu_read_lock();
-	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
+
+	eb = find_extent_buffer(tree, start);
+	if (eb)
 		return eb;
-	}
-	rcu_read_unlock();
 
 	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
 	if (!eb)
@@ -4565,24 +4578,17 @@ again:
 
 	spin_lock(&tree->buffer_lock);
 	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+	spin_unlock(&tree->buffer_lock);
+	radix_tree_preload_end();
 	if (ret == -EEXIST) {
-		exists = radix_tree_lookup(&tree->buffer,
-						start >> PAGE_CACHE_SHIFT);
-		if (!atomic_inc_not_zero(&exists->refs)) {
-			spin_unlock(&tree->buffer_lock);
-			radix_tree_preload_end();
-			exists = NULL;
+		exists = find_extent_buffer(tree, start);
+		if (exists)
+			goto free_eb;
+		else
 			goto again;
-		}
-		spin_unlock(&tree->buffer_lock);
-		radix_tree_preload_end();
-		mark_extent_buffer_accessed(exists);
-		goto free_eb;
 	}
 	/* add one reference for the tree */
 	check_buffer_tree_ref(eb);
-	spin_unlock(&tree->buffer_lock);
-	radix_tree_preload_end();
 
 	/*
 	 * there is a race where release page may have
@@ -4613,23 +4619,6 @@ free_eb:
 	return exists;
 }
 
-struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-					 u64 start, unsigned long len)
-{
-	struct extent_buffer *eb;
-
-	rcu_read_lock();
-	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
-		return eb;
-	}
-	rcu_read_unlock();
-
-	return NULL;
-}
-
 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
 {
 	struct extent_buffer *eb =
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f98602eac808..19620c58f096 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -271,7 +271,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-					 u64 start, unsigned long len);
+					 u64 start);
 void free_extent_buffer(struct extent_buffer *eb);
 void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE	0
-- 
cgit v1.3


From efd0c4055a26e88689b3c0cf95492dfe147cba67 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 7 Oct 2013 21:20:44 +0100
Subject: Btrfs: remove unnecessary key copy when logging inode

The btrfs_insert_empty_item() function doesn't modify its
key argument.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1134aa40b6d1..f2e0531dc719 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3170,11 +3170,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
 			  struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_key key;
 	int ret;
 
-	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
-	ret = btrfs_insert_empty_item(trans, log, path, &key,
+	ret = btrfs_insert_empty_item(trans, log, path,
+				      &BTRFS_I(inode)->location,
 				      sizeof(*inode_item));
 	if (ret && ret != -EEXIST)
 		return ret;
-- 
cgit v1.3


From 8319bfe13642ea2d411c885400377d5fc6f32271 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 8 Oct 2013 18:19:14 +0800
Subject: Btrfs: cleanup dead code of defragment

@is_extent is no more needed since we don't defrag extent root.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-defrag.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index cbaa73c871a9..76928ca97741 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -37,7 +37,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int wret;
 	int level;
-	int is_extent = 0;
 	int next_key_ret = 0;
 	u64 last_ret = 0;
 	u64 min_trans = 0;
@@ -50,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	if (root->ref_cows == 0 && !is_extent)
+	if (root->ref_cows == 0)
 		goto out;
 
 	if (btrfs_test_opt(root, SSD))
-- 
cgit v1.3


From 498456d33e2ee5150f045e604e4531b088083e7a Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 8 Oct 2013 18:19:43 +0800
Subject: Btrfs: kill unused code in btrfs_search_forward

After commit de78b51a2852bddccd6535e9e12de65f92787a1e
(btrfs: remove cache only arguments from defrag path), @blockptr is no more
used.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 33e9dbdd043d..8f3d6f893585 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4914,10 +4914,8 @@ again:
 		 * If it is too old, old, skip to the next one.
 		 */
 		while (slot < nritems) {
-			u64 blockptr;
 			u64 gen;
 
-			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
 			if (gen < min_trans) {
 				slot++;
-- 
cgit v1.3


From 03b2f08b5f96db5d07dba54ae008e9fdf1396e05 Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Fri, 11 Oct 2013 15:35:45 -0300
Subject: btrfs: Fix memory leakage in the tree-log.c

In add_inode_ref() function:

Initializes local pointers.

Reduces the logical condition with the __add_inode_ref() return
value by using only one 'goto out'.

Centralizes the exiting, ensuring the freeing of all used memory.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Reviewed-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f2e0531dc719..d12696db9ada 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1113,11 +1113,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct extent_buffer *eb, int slot,
 				  struct btrfs_key *key)
 {
-	struct inode *dir;
-	struct inode *inode;
+	struct inode *dir = NULL;
+	struct inode *inode = NULL;
 	unsigned long ref_ptr;
 	unsigned long ref_end;
-	char *name;
+	char *name = NULL;
 	int namelen;
 	int ret;
 	int search_done = 0;
@@ -1150,13 +1150,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	 * care of the rest
 	 */
 	dir = read_one_inode(root, parent_objectid);
-	if (!dir)
-		return -ENOENT;
+	if (!dir) {
+		ret = -ENOENT;
+		goto out;
+	}
 
 	inode = read_one_inode(root, inode_objectid);
 	if (!inode) {
-		iput(dir);
-		return -EIO;
+		ret = -EIO;
+		goto out;
 	}
 
 	while (ref_ptr < ref_end) {
@@ -1169,14 +1171,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 			 */
 			if (!dir)
 				dir = read_one_inode(root, parent_objectid);
-			if (!dir)
-				return -ENOENT;
+			if (!dir) {
+				ret = -ENOENT;
+				goto out;
+			}
 		} else {
 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
 					     &ref_index);
 		}
 		if (ret)
-			return ret;
+			goto out;
 
 		/* if we already have a perfect match, we're done */
 		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
@@ -1196,12 +1200,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 						      parent_objectid,
 						      ref_index, name, namelen,
 						      &search_done);
-				if (ret == 1) {
-					ret = 0;
+				if (ret) {
+					if (ret == 1)
+						ret = 0;
 					goto out;
 				}
-				if (ret)
-					goto out;
 			}
 
 			/* insert our name */
@@ -1215,6 +1218,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
 		kfree(name);
+		name = NULL;
 		if (log_ref_ver) {
 			iput(dir);
 			dir = NULL;
@@ -1225,6 +1229,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	ret = overwrite_item(trans, root, path, eb, slot, key);
 out:
 	btrfs_release_path(path);
+	kfree(name);
 	iput(dir);
 	iput(inode);
 	return ret;
-- 
cgit v1.3


From f747cab7b75ba298df3ac234de9f1655957786cc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 10 Oct 2013 20:37:29 +0300
Subject: Btrfs: nuke a bogus rw_devices decrement in __btrfs_close_devices

On mount failures, __btrfs_close_devices can be called well before
dev-replace state is read and ->is_tgtdev_for_dev_replace is set.  This
leads to a bogus decrement of ->rw_devices and sets off a WARN_ON in
__btrfs_close_devices if replace target device happens to be on the
lists and we fail early in the mount sequence.  Fix this by checking
the devid instead of ->is_tgtdev_for_dev_replace before the decrement:
for replace targets devid is always equal to BTRFS_DEV_REPLACE_DEVID.

Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c846cc8eb32c..9dfe038015f7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -666,7 +666,8 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		if (device->bdev)
 			fs_devices->open_devices--;
 
-		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+		if (device->writeable &&
+		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
 			list_del_init(&device->dev_alloc_list);
 			fs_devices->rw_devices--;
 		}
-- 
cgit v1.3


From adfa97cbdfe376b02bb3e1ea3166958fec35ca6f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 10 Oct 2013 20:39:28 +0300
Subject: Btrfs: don't leak ioctl args in btrfs_ioctl_dev_replace

struct btrfs_ioctl_dev_replace_args memory is leaked if replace is
requested on a read-only filesystem.  Fix it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 98d4ffef961e..08ac53c5f78d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3667,9 +3667,10 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 
 	switch (p->cmd) {
 	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
-		if (root->fs_info->sb->s_flags & MS_RDONLY)
-			return -EROFS;
-
+		if (root->fs_info->sb->s_flags & MS_RDONLY) {
+			ret = -EROFS;
+			goto out;
+		}
 		if (atomic_xchg(
 			&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
@@ -3695,7 +3696,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 
 	if (copy_to_user(arg, p, sizeof(*p)))
 		ret = -EFAULT;
-
+out:
 	kfree(p);
 	return ret;
 }
-- 
cgit v1.3


From e649e587cbc66287b2a4bff8b2113ad679a2b8d8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 10 Oct 2013 20:40:21 +0300
Subject: Btrfs: disallow 'btrfs {balance,replace} cancel' on ro mounts

For both balance and replace, cancelling involves changing the on-disk
state and committing a transaction, which is not a good thing to do on
read-only filesystems.

Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dev-replace.c | 3 +++
 fs/btrfs/volumes.c     | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9efb94e95858..98df26181349 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -650,6 +650,9 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 	u64 result;
 	int ret;
 
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 	btrfs_dev_replace_lock(dev_replace);
 	switch (dev_replace->replace_state) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9dfe038015f7..5d7ea267d8ff 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3424,6 +3424,9 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 {
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	mutex_lock(&fs_info->balance_mutex);
 	if (!fs_info->balance_ctl) {
 		mutex_unlock(&fs_info->balance_mutex);
-- 
cgit v1.3


From 30d133fc221a0ec50030c33aa7bf6931503894c7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 11 Oct 2013 16:30:23 +0800
Subject: Btrfs: fixup error path in __btrfs_inc_extent_ref

When we fail to add a reference after a non-inline insertion by some reasons,
eg. ENOSPC, we'll abort the transaction, but we don't return this error to
the caller who has to walk around again to find something wrong, that's
unnecessary.

Also fixup other error paths to keep it simple.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 054b11dc8edf..c4971108f126 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1979,7 +1979,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *item;
 	u64 refs;
 	int ret;
-	int err = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1992,14 +1991,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 					   path, bytenr, num_bytes, parent,
 					   root_objectid, owner, offset,
 					   refs_to_add, extent_op);
-	if (ret == 0)
+	if (ret != -EAGAIN)
 		goto out;
 
-	if (ret != -EAGAIN) {
-		err = ret;
-		goto out;
-	}
-
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, item);
@@ -2021,7 +2015,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, root, ret);
 out:
 	btrfs_free_path(path);
-	return err;
+	return ret;
 }
 
 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
-- 
cgit v1.3


From 27087f370174ebb298cdf6dcb36b1e4dcbe34e93 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Fri, 11 Oct 2013 15:20:42 +0200
Subject: Btrfs: init device stats for new devices

Device stats are only initialized (read from tree items) on mount.
Trying to read device stats after adding or replacing new devices will
return errors.

btrfs_init_new_device() and btrfs_init_dev_replace_tgtdev() are the two
functions that allocate and initialize new btrfs_device structures after
a filesystem is mounted. They set the device stats to zero by using
kzalloc() which is correct for new devices. The only missing thing was
to declare these stats as being valid (device->dev_stats_valid = 1) and
this patch adds this missing code.

This is the reproducer:

TEST_DEV1=/dev/sdzzzzz1
TEST_DEV2=/dev/sdzzzzz2
TEST_DEV3=/dev/sdzzzzz3
TEST_MNT=/mnt
mkfs.btrfs $TEST_DEV1
mount $TEST_DEV1 $TEST_MNT
btrfs device add $TEST_DEV2 $TEST_MNT
btrfs device stat $TEST_MNT
btrfs replace start -B $TEST_DEV2 $TEST_DEV3 $TEST_MNT
btrfs device stat $TEST_MNT
umount $TEST_MNT

Reported-by: Ondrej Kunc <kunc88@gmail.com>
Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5d7ea267d8ff..f492f7eabf29 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2042,6 +2042,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->in_fs_metadata = 1;
 	device->is_tgtdev_for_dev_replace = 0;
 	device->mode = FMODE_EXCL;
+	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
@@ -2209,6 +2210,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
 	device->in_fs_metadata = 1;
 	device->is_tgtdev_for_dev_replace = 1;
 	device->mode = FMODE_EXCL;
+	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, 4096);
 	device->fs_devices = fs_info->fs_devices;
 	list_add(&device->dev_list, &fs_info->fs_devices->devices);
-- 
cgit v1.3


From 361c093d7f99c3f6cbb07d5c580ce778ab418c42 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Fri, 11 Oct 2013 17:14:58 +0200
Subject: Btrfs: Wait for uuid-tree rebuild task on remount read-only

If the user remounts the filesystem read-only while the uuid-tree
scan and rebuild task is still running (this happens once after the
filesystem was mounted with an old kernel, or when forced with the
mount options), the remount should wait on the tasks completion
before setting the filesystem read-only. Otherwise the background
task continues to write to the filesystem which is apparently not
what users expect.

The reproducer:

TEST_DEV=/dev/sdzzzzz1
TEST_MNT=/mnt
mkfs.btrfs -f $TEST_DEV
mount $TEST_DEV $TEST_MNT
for i in `seq 50000`; do btrfs subvolume create ${TEST_MNT}/$i; done
umount $TEST_MNT
mount $TEST_DEV $TEST_MNT -o rescan_uuid_tree
sleep 1
ps -elf | fgrep '[btrfs-uuid]' | grep -v grep
mount $TEST_DEV $TEST_MNT -o ro,remount
ps -elf | fgrep '[btrfs-uuid]' | grep -v grep
sleep 1
umount $TEST_MNT

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1f62c1ccab1f..02f552f11829 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1330,6 +1330,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		 * this also happens on 'umount -rf' or on shutdown, when
 		 * the filesystem is busy.
 		 */
+
+		/* wait for the uuid_scan task to finish */
+		down(&fs_info->uuid_tree_rescan_sem);
+		/* avoid complains from lockdep et al. */
+		up(&fs_info->uuid_tree_rescan_sem);
+
 		sb->s_flags |= MS_RDONLY;
 
 		btrfs_dev_replace_suspend_for_unmount(fs_info);
-- 
cgit v1.3


From 3c77bd94ecb4ad2653d1e3eb22295018533a1e21 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Sat, 12 Oct 2013 20:32:59 +0100
Subject: Btrfs: don't leak delayed node on path allocation failure

If the path allocation failed, we would return without decrementing
the reference count in the delayed node we got before, resulting
in a leak.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/delayed-inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index cbd9523ad09c..df1a496bc8d4 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1174,8 +1174,10 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 	mutex_unlock(&delayed_node->mutex);
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		btrfs_release_delayed_node(delayed_node);
 		return -ENOMEM;
+	}
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-- 
cgit v1.3


From 5ede859b00af261c78b3848a362253c76652e035 Mon Sep 17 00:00:00 2001
From: chandan <chandan@linux.vnet.ibm.com>
Date: Mon, 14 Oct 2013 18:44:39 +0530
Subject: Btrfs: btrfs_add_ordered_operation: Fix last modified transaction
 comparison.

Comparison of an inode's last modified transaction with the last committed
transaction is incorrect. Fix it.

Signed-off-by: chandan <chandan@linux.vnet.ibm.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ordered-data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c702cb62f78a..1a36a0c3ee27 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1076,7 +1076,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 	 * if this file hasn't been changed since the last transaction
 	 * commit, we can safely return without doing anything
 	 */
-	if (last_mod < root->fs_info->last_trans_committed)
+	if (last_mod <= root->fs_info->last_trans_committed)
 		return;
 
 	spin_lock(&root->fs_info->ordered_root_lock);
-- 
cgit v1.3


From 229eed4348a482c11cf8d494392305a85ed478f9 Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Mon, 14 Oct 2013 12:18:25 -0300
Subject: btrfs: simplify kmalloc+copy_from_user to memdup_user

Use memdup_user rather than duplicating its implementation
This is a little bit restricted to reduce false positives

The semantic patch that makes this report is available
in scripts/coccinelle/api/memdup_user.cocci.

More information about semantic patching is available at
http://coccinelle.lip6.fr/

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 08ac53c5f78d..864aab4b1bfa 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2715,15 +2715,10 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 	size = sizeof(tmp) +
 		tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
 
-	same = kmalloc(size, GFP_NOFS);
-	if (!same) {
-		ret = -EFAULT;
-		goto out;
-	}
+	same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size);
 
-	if (copy_from_user(same,
-			   (struct btrfs_ioctl_same_args __user *)argp, size)) {
-		ret = -EFAULT;
+	if (IS_ERR(same)) {
+		ret = PTR_ERR(same);
 		goto out;
 	}
 
-- 
cgit v1.3


From e93ae26fe1123e9077e79cc7af8a0d42adf4812f Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 14 Oct 2013 22:49:11 +0100
Subject: Btrfs: optimize tree-log.c:count_inode_refs()

Avoid repeated tree searches by processing all inode ref items in
a leaf at once instead of processing one at a time, followed by a
path release and a tree search for a key with a decremented offset.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d12696db9ada..d45c04b30690 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1312,6 +1312,7 @@ static int count_inode_refs(struct btrfs_root *root,
 				break;
 			path->slots[0]--;
 		}
+process_slot:
 		btrfs_item_key_to_cpu(path->nodes[0], &key,
 				      path->slots[0]);
 		if (key.objectid != ino ||
@@ -1332,6 +1333,10 @@ static int count_inode_refs(struct btrfs_root *root,
 
 		if (key.offset == 0)
 			break;
+		if (path->slots[0] > 0) {
+			path->slots[0]--;
+			goto process_slot;
+		}
 		key.offset--;
 		btrfs_release_path(path);
 	}
-- 
cgit v1.3


From e8b0d724d596f2ac1264ad830a04ef8e415be956 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 15 Oct 2013 00:12:27 +0100
Subject: Btrfs: fix btrfs_prev_leaf() previous key computation

If we decrement the key type, we must reset its offset to the largest
possible offset (u64)-1. If we decrement the key's objectid, then we
must reset the key's type and offset to their largest possible values,
(u8)-1 and (u64)-1 respectively. Not doing so can make us miss an
items in the tree.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8f3d6f893585..a749121203df 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4827,14 +4827,18 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
 
-	if (key.offset > 0)
+	if (key.offset > 0) {
 		key.offset--;
-	else if (key.type > 0)
+	} else if (key.type > 0) {
 		key.type--;
-	else if (key.objectid > 0)
+		key.offset = (u64)-1;
+	} else if (key.objectid > 0) {
 		key.objectid--;
-	else
+		key.type = (u8)-1;
+		key.offset = (u64)-1;
+	} else {
 		return 1;
+	}
 
 	btrfs_release_path(path);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-- 
cgit v1.3


From ff76b0565523319d7c1c0b51d5a5a8915d33efab Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Tue, 15 Oct 2013 20:08:15 +0200
Subject: Btrfs: Don't allocate inode that is already in use

Due to an off-by-one error, it is possible to reproduce a bug
when the inode cache is used.

The same inode number is assigned twice, the second time this
leads to an EEXIST in btrfs_insert_empty_items().

The issue can happen when a file is removed right after a subvolume
is created and then a new inode number is created before the
inodes in free_inode_pinned are processed.
unlink() calls btrfs_return_ino() which calls start_caching() in this
case which adds [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID] by
searching for the highest inode (which already cannot find the
unlinked one anymore in btrfs_find_free_objectid()). So if this
unlinked inode's number is equal to the highest_ino + 1 (or >= this value
instead of > this value which was the off-by-one error), we mustn't add
the inode number to free_ino_pinned (caching_thread() does it right).
In this case we need to try directly to add the number to the inode_cache
which will fail in this case.

When this inode number is allocated while it is still in free_ino_pinned,
it is allocated and still added to the free inode cache when the
pinned inodes are processed, thus one of the following inode number
allocations will get an inode that is already in use and fail with EEXIST
in btrfs_insert_empty_items().

One example which was created with the reproducer below:
Create a snapshot, work in the newly created snapshot for the rest.
In unlink(inode 34284) call btrfs_return_ino() which calls start_caching().
start_caching() calls add_free_space [34284, 18446744073709517077].
In btrfs_return_ino(), call start_caching pinned [34284, 1] which is wrong.
mkdir() call btrfs_find_ino_for_alloc() which returns the number 34284.
btrfs_unpin_free_ino calls add_free_space [34284, 1].
mkdir() call btrfs_find_ino_for_alloc() which returns the number 34284.
EEXIST when the new inode is inserted.

One possible reproducer is this one:
 #!/bin/sh
 # preparation
TEST_DEV=/dev/sdc1
TEST_MNT=/mnt
umount ${TEST_MNT} 2>/dev/null || true
mkfs.btrfs -f ${TEST_DEV}
mount ${TEST_DEV} ${TEST_MNT} -o \
 rw,relatime,compress=lzo,space_cache,inode_cache
btrfs subv create ${TEST_MNT}/s1
for i in `seq 34027`; do touch ${TEST_MNT}/s1/${i}; done
btrfs subv snap ${TEST_MNT}/s1 ${TEST_MNT}/s2
FILENAME=`find ${TEST_MNT}/s1/ -inum 4085 | sed 's|^.*/\([^/]*\)$|\1|'`
rm ${TEST_MNT}/s2/$FILENAME
touch ${TEST_MNT}/s2/$FILENAME
 # the following steps can be repeated to reproduce the issue again and again
[ -e ${TEST_MNT}/s3 ] && btrfs subv del ${TEST_MNT}/s3
btrfs subv snap ${TEST_MNT}/s2 ${TEST_MNT}/s3
rm ${TEST_MNT}/s3/$FILENAME
touch ${TEST_MNT}/s3/$FILENAME
ls -alFi ${TEST_MNT}/s?/$FILENAME
touch ${TEST_MNT}/s3/_1 || logger FAILED
ls -alFi ${TEST_MNT}/s?/_1
touch ${TEST_MNT}/s3/_2 || logger FAILED
ls -alFi ${TEST_MNT}/s?/_2
touch ${TEST_MNT}/s3/__1 || logger FAILED
ls -alFi ${TEST_MNT}/s?/__1
touch ${TEST_MNT}/s3/__2 || logger FAILED
ls -alFi ${TEST_MNT}/s?/__2
 # if the above is not enough, add the following loop:
for i in `seq 3 9`; do touch ${TEST_MNT}/s3/__${i} || logger FAILED; done
 #for i in `seq 3 34027`; do touch ${TEST_MNT}/s3/__${i} || logger FAILED; done
 # one of the touch(1) calls in s3 fail due to EEXIST because the inode is
 # already in use that btrfs_find_ino_for_alloc() returns.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Reviewed-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode-map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 014de497e888..ec08004a6a20 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -237,7 +237,7 @@ again:
 		start_caching(root);
 
 		if (objectid <= root->cache_progress ||
-		    objectid > root->highest_objectid)
+		    objectid >= root->highest_objectid)
 			__btrfs_add_free_space(ctl, objectid, 1);
 		else
 			__btrfs_add_free_space(pinned, objectid, 1);
-- 
cgit v1.3


From 8185554d3eb09d23a805456b6fa98dcbb34aa518 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 15 Oct 2013 18:44:00 +0100
Subject: Btrfs: fix incorrect inode acl reset

When a directory has a default ACL and a subdirectory is created
under that directory, btrfs_init_acl() is called when the
subdirectory's inode is created to initialize the inode's ACL
(inherited from the parent directory) but it was clearing the ACL
from the inode after setting it if posix_acl_create() returned
success, instead of clearing it only if it returned an error.

To reproduce this issue:

$ mkfs.btrfs -f /dev/loop0
$ mount /dev/loop0 /mnt
$ mkdir /mnt/acl
$ setfacl -d --set u::rwx,g::rwx,o::- /mnt/acl
$ getfacl /mnt/acl
user::rwx
group::rwx
other::r-x
default:user::rwx
default:group::rwx
default:other::---

$ mkdir /mnt/acl/dir1
$ getfacl /mnt/acl/dir1
user::rwx
group::rwx
other::---

After unmounting and mounting again the filesystem, fgetacl returned the
expected ACL:

$ umount /mnt/acl
$ mount /dev/loop0 /mnt
$ getfacl /mnt/acl/dir1
user::rwx
group::rwx
other::---
default:user::rwx
default:group::rwx
default:other::---

Meaning that the underlying xattr was persisted.

Reported-by: Giuseppe Fierro <giuseppe@fierro.org>
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/acl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index e15d2b0d8d3b..0890c83643e9 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -229,7 +229,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
-		} else {
+		} else if (ret < 0) {
 			cache_no_acl(inode);
 		}
 	} else {
-- 
cgit v1.3


From 4546bcaeba435c1d0b7f38c011cbb2367497ca8d Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@redhat.com>
Date: Wed, 16 Oct 2013 12:10:32 -0700
Subject: btrfs: use get_seconds() instead of btrfs wrapper

Signed-off-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dev-replace.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 98df26181349..a36343a2dd68 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -38,7 +38,6 @@
 #include "rcu-string.h"
 #include "dev-replace.h"
 
-static u64 btrfs_get_seconds_since_1970(void);
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 				       int scrub_ret);
 static void btrfs_dev_replace_update_device_in_mapping_tree(
@@ -296,13 +295,6 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 		dev_replace->cursor_left_last_write_of_item;
 }
 
-static u64 btrfs_get_seconds_since_1970(void)
-{
-	struct timespec t = CURRENT_TIME_SEC;
-
-	return t.tv_sec;
-}
-
 int btrfs_dev_replace_start(struct btrfs_root *root,
 			    struct btrfs_ioctl_dev_replace_args *args)
 {
@@ -390,7 +382,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	 * go to the tgtdev as well (refer to btrfs_map_block()).
 	 */
 	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
-	dev_replace->time_started = btrfs_get_seconds_since_1970();
+	dev_replace->time_started = get_seconds();
 	dev_replace->cursor_left = 0;
 	dev_replace->committed_cursor_left = 0;
 	dev_replace->cursor_left_last_write_of_item = 0;
@@ -493,7 +485,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
 	dev_replace->tgtdev = NULL;
 	dev_replace->srcdev = NULL;
-	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
 
 	if (scrub_ret) {
@@ -671,7 +663,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 		break;
 	}
 	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
-	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
 	btrfs_dev_replace_unlock(dev_replace);
 	btrfs_scrub_cancel(fs_info);
@@ -706,7 +698,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 		dev_replace->replace_state =
 			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
-		dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+		dev_replace->time_stopped = get_seconds();
 		dev_replace->item_needs_writeback = 1;
 		pr_info("btrfs: suspending dev_replace for unmount\n");
 		break;
-- 
cgit v1.3


From 1877e1a747db33d7264d8046e96373962da72a3d Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@redhat.com>
Date: Wed, 16 Oct 2013 12:10:33 -0700
Subject: btrfs: remove move_pages()

move_pages() has an inefficient backwards byte copy of regions of two
different pages.  They're different pages so the regions won't overlap
and it could use memcpy().

At that point, though, move_pages() would be a slightly dimmer
re-implementation of copy_pages() that lacked the test for overlapping
page regions.

So remove move_pages() and just call copy_pages().

Signed-off-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent_io.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5439f24ab06b..5bf98d2ee70f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5082,23 +5082,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	}
 }
 
-static void move_pages(struct page *dst_page, struct page *src_page,
-		       unsigned long dst_off, unsigned long src_off,
-		       unsigned long len)
-{
-	char *dst_kaddr = page_address(dst_page);
-	if (dst_page == src_page) {
-		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
-	} else {
-		char *src_kaddr = page_address(src_page);
-		char *p = dst_kaddr + dst_off + len;
-		char *s = src_kaddr + src_off + len;
-
-		while (len--)
-			*--p = *--s;
-	}
-}
-
 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
 {
 	unsigned long distance = (src > dst) ? src - dst : dst - src;
@@ -5209,7 +5192,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
-		move_pages(extent_buffer_page(dst, dst_i),
+		copy_pages(extent_buffer_page(dst, dst_i),
 			   extent_buffer_page(dst, src_i),
 			   dst_off_in_page - cur + 1,
 			   src_off_in_page - cur + 1, cur);
-- 
cgit v1.3


From 8b558c5f097b636209b654f4d7775ac96054d6e3 Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@redhat.com>
Date: Wed, 16 Oct 2013 12:10:34 -0700
Subject: btrfs: remove fs/btrfs/compat.h

fs/btrfs/compat.h only contained trivial macro wrappers of drop_nlink()
and inc_nlink().  This doesn't belong in mainline.

Signed-off-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/compat.h      | 7 -------
 fs/btrfs/compression.c | 1 -
 fs/btrfs/dev-replace.c | 1 -
 fs/btrfs/disk-io.c     | 1 -
 fs/btrfs/export.c      | 1 -
 fs/btrfs/extent-tree.c | 1 -
 fs/btrfs/extent_io.c   | 1 -
 fs/btrfs/file.c        | 1 -
 fs/btrfs/inode.c       | 5 ++---
 fs/btrfs/ioctl.c       | 1 -
 fs/btrfs/raid56.c      | 1 -
 fs/btrfs/super.c       | 1 -
 fs/btrfs/tree-log.c    | 9 ++++-----
 fs/btrfs/volumes.c     | 1 -
 14 files changed, 6 insertions(+), 26 deletions(-)
 delete mode 100644 fs/btrfs/compat.h

(limited to 'fs')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
deleted file mode 100644
index 7c4503ef6efd..000000000000
--- a/fs/btrfs/compat.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _COMPAT_H_
-#define _COMPAT_H_
-
-#define btrfs_drop_nlink(inode) drop_nlink(inode)
-#define btrfs_inc_nlink(inode)	inc_nlink(inode)
-
-#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6aad98cb343f..df019c7f58b2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
 #include <linux/slab.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a36343a2dd68..cb94310db072 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -26,7 +26,6 @@
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ebc784aba3f5..62c4aba221bb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -33,7 +33,6 @@
 #include <linux/uuid.h>
 #include <linux/semaphore.h>
 #include <asm/unaligned.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 4b8691607373..41422a3de8ed 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -5,7 +5,6 @@
 #include "btrfs_inode.h"
 #include "print-tree.h"
 #include "export.h"
-#include "compat.h"
 
 #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
 						 parent_objectid) / 4)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c4971108f126..4062a65f9fb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -25,7 +25,6 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
-#include "compat.h"
 #include "hash.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5bf98d2ee70f..a93bab452183 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,7 +13,6 @@
 #include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
-#include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bf3465cf455a..14b41d569a3e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,7 +39,6 @@
 #include "print-tree.h"
 #include "tree-log.h"
 #include "locking.h"
-#include "compat.h"
 #include "volumes.h"
 
 static struct kmem_cache *btrfs_inode_defrag_cachep;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9d39d08a1555..14c6ab74cf8c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,7 +43,6 @@
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
 #include <linux/posix_acl_xattr.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3645,7 +3644,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	int ret;
 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 	if (!ret) {
-		btrfs_drop_nlink(inode);
+		drop_nlink(inode);
 		ret = btrfs_update_inode(trans, root, inode);
 	}
 	return ret;
@@ -5738,7 +5737,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		goto fail;
 	}
 
-	btrfs_inc_nlink(inode);
+	inc_nlink(inode);
 	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ihold(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 864aab4b1bfa..9ea05506cb84 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -44,7 +44,6 @@
 #include <linux/uuid.h>
 #include <linux/btrfs.h>
 #include <linux/uaccess.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d0ecfbd9cc9f..24ac21840a9a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -33,7 +33,6 @@
 #include <linux/raid/xor.h>
 #include <linux/vmalloc.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 02f552f11829..c9079db7441e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -42,7 +42,6 @@
 #include <linux/cleancache.h>
 #include <linux/ratelimit.h>
 #include <linux/btrfs.h>
-#include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d45c04b30690..f98002ed32b1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,7 +26,6 @@
 #include "locking.h"
 #include "print-tree.h"
 #include "backref.h"
-#include "compat.h"
 #include "tree-log.h"
 #include "hash.h"
 
@@ -936,7 +935,7 @@ again:
 					    parent_objectid,
 					    victim_name,
 					    victim_name_len)) {
-				btrfs_inc_nlink(inode);
+				inc_nlink(inode);
 				btrfs_release_path(path);
 
 				ret = btrfs_unlink_inode(trans, root, dir,
@@ -1006,7 +1005,7 @@ again:
 				victim_parent = read_one_inode(root,
 							       parent_objectid);
 				if (victim_parent) {
-					btrfs_inc_nlink(inode);
+					inc_nlink(inode);
 					btrfs_release_path(path);
 
 					ret = btrfs_unlink_inode(trans, root,
@@ -1490,7 +1489,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 		if (!inode->i_nlink)
 			set_nlink(inode, 1);
 		else
-			btrfs_inc_nlink(inode);
+			inc_nlink(inode);
 		ret = btrfs_update_inode(trans, root, inode);
 	} else if (ret == -EEXIST) {
 		ret = 0;
@@ -1851,7 +1850,7 @@ again:
 				goto out;
 			}
 
-			btrfs_inc_nlink(inode);
+			inc_nlink(inode);
 			ret = btrfs_unlink_inode(trans, root, dir, inode,
 						 name, name_len);
 			if (!ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f492f7eabf29..716abee16a5b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -28,7 +28,6 @@
 #include <linux/raid/pq.h>
 #include <linux/semaphore.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
-- 
cgit v1.3


From cab45e22da48b523665b3566f43527a2fead1d1e Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 16 Oct 2013 16:27:01 -0400
Subject: btrfs: add tracing for failed reservations

When debugging ENOSPC issues, it's nice to be able to see which
reservations failed as well as the ones which succeeded.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4062a65f9fb7..a807acc9a63e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3684,6 +3684,9 @@ commit_trans:
 			goto again;
 		}
 
+		trace_btrfs_space_reservation(root->fs_info,
+					      "space_info:enospc",
+					      data_sinfo->flags, bytes, 1);
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
@@ -4335,6 +4338,10 @@ out:
 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
 			ret = 0;
 	}
+	if (ret == -ENOSPC)
+		trace_btrfs_space_reservation(root->fs_info,
+					      "space_info:enospc",
+					      space_info->flags, orig_bytes, 1);
 	if (flushing) {
 		spin_lock(&space_info->lock);
 		space_info->flush = 0;
-- 
cgit v1.3


From 55994887082d6e9b3d31b55fc5be158fd97eadbc Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Fri, 18 Oct 2013 15:42:56 +0100
Subject: Btrfs: optimize extent item search in run_delayed_extent_op

Instead of doing another extent tree search if the first search failed
to find a metadata item, check if the previous item in the leaf is an
extent item and use it if it is, otherwise do the second tree search
for an extent item.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a807acc9a63e..4595a65db729 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2130,15 +2130,28 @@ again:
 	}
 	if (ret > 0) {
 		if (metadata) {
-			btrfs_release_path(path);
-			metadata = 0;
+			if (path->slots[0] > 0) {
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid == node->bytenr &&
+				    key.type == BTRFS_EXTENT_ITEM_KEY &&
+				    key.offset == node->num_bytes)
+					ret = 0;
+			}
+			if (ret > 0) {
+				btrfs_release_path(path);
+				metadata = 0;
 
-			key.offset = node->num_bytes;
-			key.type = BTRFS_EXTENT_ITEM_KEY;
-			goto again;
+				key.objectid = node->bytenr;
+				key.offset = node->num_bytes;
+				key.type = BTRFS_EXTENT_ITEM_KEY;
+				goto again;
+			}
+		} else {
+			err = -EIO;
+			goto out;
 		}
-		err = -EIO;
-		goto out;
 	}
 
 	leaf = path->nodes[0];
-- 
cgit v1.3


From a5f519c91df8d81b3e6ed3489806ba66f6b82ca7 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Mon, 21 Oct 2013 14:18:17 +0200
Subject: Btrfs: fix check_int 'leaf item out of bounce' regression

Yet another cleanup patch broke code for which no xfstest exists.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/check-integrity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 1c47be187240..3281f7ebc0f7 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1038,7 +1038,7 @@ leaf_item_out_of_bounce_error:
 						     disk_item_offset,
 						     sizeof(struct btrfs_item));
 			item_offset = btrfs_stack_item_offset(&disk_item);
-			item_size = btrfs_stack_item_offset(&disk_item);
+			item_size = btrfs_stack_item_size(&disk_item);
 			disk_key = &disk_item.key;
 			type = btrfs_disk_key_type(disk_key);
 
-- 
cgit v1.3


From 301993a4a13b34c13f7f2a7ed3429d231a6d6166 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Mon, 21 Oct 2013 18:46:58 +0200
Subject: Btrfs: check_int, remove warning for mixed-mode

In mixed-mode, when a data-block was later reused for metadata, a
warning was printed. This condition is now filtered out and the
warning is eliminated in this case.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/check-integrity.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 3281f7ebc0f7..656b07637e39 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1900,7 +1900,9 @@ again:
 							       dev_state,
 							       dev_bytenr);
 			}
-			if (block->logical_bytenr != bytenr) {
+			if (block->logical_bytenr != bytenr &&
+			    !(!block->is_metadata &&
+			      block->logical_bytenr == 0))
 				printk(KERN_INFO
 				       "Written block @%llu (%s/%llu/%d)"
 				       " found in hash table, %c,"
@@ -1910,15 +1912,14 @@ again:
 				       block->mirror_num,
 				       btrfsic_get_block_type(state, block),
 				       block->logical_bytenr);
-				block->logical_bytenr = bytenr;
-			} else if (state->print_mask &
-				   BTRFSIC_PRINT_MASK_VERBOSE)
+			else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				printk(KERN_INFO
 				       "Written block @%llu (%s/%llu/%d)"
 				       " found in hash table, %c.\n",
 				       bytenr, dev_state->name, dev_bytenr,
 				       block->mirror_num,
 				       btrfsic_get_block_type(state, block));
+			block->logical_bytenr = bytenr;
 		} else {
 			if (num_pages * PAGE_CACHE_SIZE <
 			    state->datablock_size) {
-- 
cgit v1.3


From ed2590953bd06b892f0411fc94e19175d32f197a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 11:36:01 -0400
Subject: Btrfs: stop using vfs_read in send

Apparently we don't actually close the files until we return to userspace, so
stop using vfs_read in send.  This is actually better for us since we can avoid
all the extra logic of holding the file we're sending open and making sure to
clean it up.  This will fix people who have been hitting too many files open
errors when trying to send.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/send.c | 175 +++++++++++++++++++++++---------------------------------
 1 file changed, 72 insertions(+), 103 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 0a894399be16..e26a3a62fd3f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -121,7 +121,6 @@ struct send_ctx {
 	struct list_head name_cache_list;
 	int name_cache_size;
 
-	struct file *cur_inode_filp;
 	char *read_buf;
 };
 
@@ -2119,77 +2118,6 @@ out:
 	return ret;
 }
 
-/*
- * Called for regular files when sending extents data. Opens a struct file
- * to read from the file.
- */
-static int open_cur_inode_file(struct send_ctx *sctx)
-{
-	int ret = 0;
-	struct btrfs_key key;
-	struct path path;
-	struct inode *inode;
-	struct dentry *dentry;
-	struct file *filp;
-	int new = 0;
-
-	if (sctx->cur_inode_filp)
-		goto out;
-
-	key.objectid = sctx->cur_ino;
-	key.type = BTRFS_INODE_ITEM_KEY;
-	key.offset = 0;
-
-	inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
-			&new);
-	if (IS_ERR(inode)) {
-		ret = PTR_ERR(inode);
-		goto out;
-	}
-
-	dentry = d_obtain_alias(inode);
-	inode = NULL;
-	if (IS_ERR(dentry)) {
-		ret = PTR_ERR(dentry);
-		goto out;
-	}
-
-	path.mnt = sctx->mnt;
-	path.dentry = dentry;
-	filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred());
-	dput(dentry);
-	dentry = NULL;
-	if (IS_ERR(filp)) {
-		ret = PTR_ERR(filp);
-		goto out;
-	}
-	sctx->cur_inode_filp = filp;
-
-out:
-	/*
-	 * no xxxput required here as every vfs op
-	 * does it by itself on failure
-	 */
-	return ret;
-}
-
-/*
- * Closes the struct file that was created in open_cur_inode_file
- */
-static int close_cur_inode_file(struct send_ctx *sctx)
-{
-	int ret = 0;
-
-	if (!sctx->cur_inode_filp)
-		goto out;
-
-	ret = filp_close(sctx->cur_inode_filp, NULL);
-	sctx->cur_inode_filp = NULL;
-
-out:
-	return ret;
-}
-
 /*
  * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
  */
@@ -3622,6 +3550,72 @@ out:
 	return ret;
 }
 
+static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+{
+	struct btrfs_root *root = sctx->send_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct inode *inode;
+	struct page *page;
+	char *addr;
+	struct btrfs_key key;
+	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+	pgoff_t last_index;
+	unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+	ssize_t ret = 0;
+
+	key.objectid = sctx->cur_ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (offset + len > i_size_read(inode)) {
+		if (offset > i_size_read(inode))
+			len = 0;
+		else
+			len = offset - i_size_read(inode);
+	}
+	if (len == 0)
+		goto out;
+
+	last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	while (index <= last_index) {
+		unsigned cur_len = min_t(unsigned, len,
+					 PAGE_CACHE_SIZE - pg_offset);
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				break;
+			}
+		}
+
+		addr = kmap(page);
+		memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+		pg_offset = 0;
+		len -= cur_len;
+		ret += cur_len;
+	}
+out:
+	iput(inode);
+	return ret;
+}
+
 /*
  * Read some bytes from the current inode/file and send a write command to
  * user space.
@@ -3630,35 +3624,20 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 {
 	int ret = 0;
 	struct fs_path *p;
-	loff_t pos = offset;
-	int num_read = 0;
-	mm_segment_t old_fs;
+	ssize_t num_read = 0;
 
 	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
-	/*
-	 * vfs normally only accepts user space buffers for security reasons.
-	 * we only read from the file and also only provide the read_buf buffer
-	 * to vfs. As this buffer does not come from a user space call, it's
-	 * ok to temporary allow kernel space buffers.
-	 */
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-
 verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
-	ret = open_cur_inode_file(sctx);
-	if (ret < 0)
-		goto out;
-
-	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
-	if (ret < 0)
-		goto out;
-	num_read = ret;
-	if (!num_read)
+	num_read = fill_read_buf(sctx, offset, len);
+	if (num_read <= 0) {
+		if (num_read < 0)
+			ret = num_read;
 		goto out;
+	}
 
 	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
 	if (ret < 0)
@@ -3677,7 +3656,6 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 tlv_put_failure:
 out:
 	fs_path_free(p);
-	set_fs(old_fs);
 	if (ret < 0)
 		return ret;
 	return num_read;
@@ -4222,10 +4200,6 @@ static int changed_inode(struct send_ctx *sctx,
 	u64 left_gen = 0;
 	u64 right_gen = 0;
 
-	ret = close_cur_inode_file(sctx);
-	if (ret < 0)
-		goto out;
-
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
 
@@ -4686,11 +4660,6 @@ static int send_subvol(struct send_ctx *sctx)
 	}
 
 out:
-	if (!ret)
-		ret = close_cur_inode_file(sctx);
-	else
-		close_cur_inode_file(sctx);
-
 	free_recorded_refs(sctx);
 	return ret;
 }
-- 
cgit v1.3


From 0ef8b726075aa6931ddf1c16f5bae043eef184f9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 16:13:35 -0400
Subject: Btrfs: return an error from btrfs_wait_ordered_range

I noticed that if the free space cache has an error writing out it's data it
won't actually error out, it will just carry on.  This is because it doesn't
check the return value of btrfs_wait_ordered_range, which didn't actually return
anything.  So fix this in order to keep us from making free space cache look
valid when it really isnt.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/file.c             | 66 ++++++++++++++++++++++++++-------------------
 fs/btrfs/free-space-cache.c |  9 +++++--
 fs/btrfs/inode.c            |  9 +++++--
 fs/btrfs/ordered-data.c     | 26 ++++++++++++------
 fs/btrfs/ordered-data.h     |  2 +-
 fs/btrfs/relocation.c       |  7 ++++-
 6 files changed, 78 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 14b41d569a3e..3a20a12513b2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1280,6 +1280,7 @@ again:
 		}
 		wait_on_page_writeback(pages[i]);
 	}
+	faili = num_pages - 1;
 	err = 0;
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
@@ -1298,8 +1299,10 @@ again:
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
 			}
-			btrfs_wait_ordered_range(inode, start_pos,
-						 last_pos - start_pos);
+			err = btrfs_wait_ordered_range(inode, start_pos,
+						       last_pos - start_pos);
+			if (err)
+				goto fail;
 			goto again;
 		}
 		if (ordered)
@@ -1808,8 +1811,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags);
-	if (full_sync)
-		btrfs_wait_ordered_range(inode, start, end - start + 1);
+	if (full_sync) {
+		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
 	atomic_inc(&root->log_batch);
 
 	/*
@@ -1875,27 +1883,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	mutex_unlock(&inode->i_mutex);
 
 	if (ret != BTRFS_NO_LOG_SYNC) {
-		if (ret > 0) {
-			/*
-			 * If we didn't already wait for ordered extents we need
-			 * to do that now.
-			 */
-			if (!full_sync)
-				btrfs_wait_ordered_range(inode, start,
-							 end - start + 1);
-			ret = btrfs_commit_transaction(trans, root);
-		} else {
+		if (!ret) {
 			ret = btrfs_sync_log(trans, root);
-			if (ret == 0) {
+			if (!ret) {
 				ret = btrfs_end_transaction(trans, root);
-			} else {
-				if (!full_sync)
-					btrfs_wait_ordered_range(inode, start,
-								 end -
-								 start + 1);
-				ret = btrfs_commit_transaction(trans, root);
+				goto out;
 			}
 		}
+		if (!full_sync) {
+			ret = btrfs_wait_ordered_range(inode, start,
+						       end - start + 1);
+			if (ret)
+				goto out;
+		}
+		ret = btrfs_commit_transaction(trans, root);
 	} else {
 		ret = btrfs_end_transaction(trans, root);
 	}
@@ -2066,7 +2067,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
 			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 
-	btrfs_wait_ordered_range(inode, offset, len);
+	ret = btrfs_wait_ordered_range(inode, offset, len);
+	if (ret)
+		return ret;
 
 	mutex_lock(&inode->i_mutex);
 	/*
@@ -2135,8 +2138,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			btrfs_put_ordered_extent(ordered);
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
 				     lockend, &cached_state, GFP_NOFS);
-		btrfs_wait_ordered_range(inode, lockstart,
-					 lockend - lockstart + 1);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
 	}
 
 	path = btrfs_alloc_path();
@@ -2307,7 +2314,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 * wait for ordered IO before we have any locks.  We'll loop again
 	 * below with the locks held.
 	 */
-	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+	ret = btrfs_wait_ordered_range(inode, alloc_start,
+				       alloc_end - alloc_start);
+	if (ret)
+		goto out;
 
 	locked_end = alloc_end - 1;
 	while (1) {
@@ -2331,8 +2341,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 			 * we can't wait on the range with the transaction
 			 * running or with the extent lock held
 			 */
-			btrfs_wait_ordered_range(inode, alloc_start,
-						 alloc_end - alloc_start);
+			ret = btrfs_wait_ordered_range(inode, alloc_start,
+						       alloc_end - alloc_start);
+			if (ret)
+				goto out;
 		} else {
 			if (ordered)
 				btrfs_put_ordered_extent(ordered);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4772f3a8e7e5..d7c445c30a16 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1008,8 +1008,13 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	if (ret)
 		goto out;
 
-
-	btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	if (ret) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+				 GFP_NOFS);
+		goto out;
+	}
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = offset;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 14c6ab74cf8c..f1fbf903bf9b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7236,7 +7236,9 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	 * outstanding dirty pages are on disk.
 	 */
 	count = iov_length(iov, nr_segs);
-	btrfs_wait_ordered_range(inode, offset, count);
+	ret = btrfs_wait_ordered_range(inode, offset, count);
+	if (ret)
+		return ret;
 
 	if (rw & WRITE) {
 		/*
@@ -7577,7 +7579,10 @@ static int btrfs_truncate(struct inode *inode)
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+				       (u64)-1);
+	if (ret)
+		return ret;
 
 	/*
 	 * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1a36a0c3ee27..bbb1a3864686 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -734,8 +734,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
 /*
  * Used to wait on ordered extents across a large range of bytes.
  */
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
+	int ret = 0;
 	u64 end;
 	u64 orig_end;
 	struct btrfs_ordered_extent *ordered;
@@ -751,8 +752,9 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+	if (ret)
+		return ret;
 	/*
 	 * So with compression we will find and lock a dirty page and clear the
 	 * first one as dirty, setup an async extent, and immediately return
@@ -768,10 +770,15 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	 * right and you are wrong.
 	 */
 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-		     &BTRFS_I(inode)->runtime_flags))
-		filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
-	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+		     &BTRFS_I(inode)->runtime_flags)) {
+		ret = filemap_fdatawrite_range(inode->i_mapping, start,
+					       orig_end);
+		if (ret)
+			return ret;
+	}
+	ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+	if (ret)
+		return ret;
 
 	end = orig_end;
 	while (1) {
@@ -788,11 +795,14 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 		}
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
+		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+			ret = -EIO;
 		btrfs_put_ordered_extent(ordered);
-		if (end == 0 || end == start)
+		if (ret || end == 0 || end == start)
 			break;
 		end--;
 	}
+	return ret;
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 0c0b35612d7a..3982db1c1b36 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -180,7 +180,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a945374993f7..729c91e151a3 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4257,7 +4257,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 			rc->extents_found);
 
 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
-			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+						       (u64)-1);
+			if (ret) {
+				err = ret;
+				goto out;
+			}
 			invalidate_mapping_pages(rc->data_inode->i_mapping,
 						 0, -1);
 			rc->stage = UPDATE_DATA_PTRS;
-- 
cgit v1.3


From 02ecd2c278471a3a7fd255662df92861c6759ebf Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 16:19:08 -0400
Subject: Btrfs: do not bug_on if we try to cow a free space cache inode

We can just return an error and we'll bail out properly.  We still want to catch
this case to make sure we don't have a bug somewhere, so just warn if this pops
up.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1fbf903bf9b..a44ca6a4feff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -843,7 +843,10 @@ static noinline int cow_file_range(struct inode *inode,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 
-	BUG_ON(btrfs_is_free_space_inode(inode));
+	if (btrfs_is_free_space_inode(inode)) {
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
 
 	num_bytes = ALIGN(end - start + 1, blocksize);
 	num_bytes = max(blocksize,  num_bytes);
-- 
cgit v1.3


From d788a34929a90f70b945c100cfe9efd4d49fb446 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 16:55:08 -0400
Subject: Btrfs: don't abort transaction in run_delalloc_nocow

This is just the write path, the only reason we start a transaction is so we can
check cross references, we don't make any actual changes, so there is no reason
to abort the transaction if we fail.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a44ca6a4feff..5916ad8681cd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1180,10 +1180,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	while (1) {
 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
 					       cur_offset, 0);
-		if (ret < 0) {
-			btrfs_abort_transaction(trans, root, ret);
+		if (ret < 0)
 			goto error;
-		}
 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
 			leaf = path->nodes[0];
 			btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1197,10 +1195,8 @@ next_slot:
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
-			if (ret < 0) {
-				btrfs_abort_transaction(trans, root, ret);
+			if (ret < 0)
 				goto error;
-			}
 			if (ret > 0)
 				break;
 			leaf = path->nodes[0];
@@ -1291,10 +1287,8 @@ out_check:
 			ret = cow_file_range(inode, locked_page,
 					     cow_start, found_key.offset - 1,
 					     page_started, nr_written, 1);
-			if (ret) {
-				btrfs_abort_transaction(trans, root, ret);
+			if (ret)
 				goto error;
-			}
 			cow_start = (u64)-1;
 		}
 
@@ -1341,10 +1335,8 @@ out_check:
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
 						      num_bytes);
-			if (ret) {
-				btrfs_abort_transaction(trans, root, ret);
+			if (ret)
 				goto error;
-			}
 		}
 
 		extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1366,10 +1358,8 @@ out_check:
 	if (cow_start != (u64)-1) {
 		ret = cow_file_range(inode, locked_page, cow_start, end,
 				     page_started, nr_written, 1);
-		if (ret) {
-			btrfs_abort_transaction(trans, root, ret);
+		if (ret)
 			goto error;
-		}
 	}
 
 error:
-- 
cgit v1.3


From 93858769172c4e3678917810e9d5de360eb991cc Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 28 Oct 2013 09:13:25 -0400
Subject: Btrfs: take ordered root lock when removing ordered operations inode

A user reported a list corruption warning from btrfs_remove_ordered_extent, it
is because we aren't taking the ordered_root_lock when we remove the inode from
the ordered operations list.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ordered-data.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbb1a3864686..8a5eff366596 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -537,7 +537,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 	 */
 	if (RB_EMPTY_ROOT(&tree->tree) &&
 	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		spin_lock(&root->fs_info->ordered_root_lock);
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 	}
 
 	if (!root->nr_ordered_extents) {
-- 
cgit v1.3


From 9f23e289edaf1e99f60de3978c50a1c7424f3e92 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 28 Oct 2013 15:03:41 -0400
Subject: Btrfs: make sure the delalloc workers actually flush compressed
 writes

When using delalloc workers in a non-waiting way (like for enospc handling) we
can end up not actually waiting for the dirty pages to be started if we have
compression.  We need to add an extra filemap flush to make sure any async
extents that have started are actually moved along before returning.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5916ad8681cd..c84adde53023 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8163,18 +8163,24 @@ out_notrans:
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
 	struct btrfs_delalloc_work *delalloc_work;
+	struct inode *inode;
 
 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
 				     work);
-	if (delalloc_work->wait)
-		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
-	else
-		filemap_flush(delalloc_work->inode->i_mapping);
+	inode = delalloc_work->inode;
+	if (delalloc_work->wait) {
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	} else {
+		filemap_flush(inode->i_mapping);
+		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+			filemap_flush(inode->i_mapping);
+	}
 
 	if (delalloc_work->delay_iput)
-		btrfs_add_delayed_iput(delalloc_work->inode);
+		btrfs_add_delayed_iput(inode);
 	else
-		iput(delalloc_work->inode);
+		iput(inode);
 	complete(&delalloc_work->completion);
 }
 
-- 
cgit v1.3


From 9e6a0c52b74b2d63a6cdb09cec5eaf66038b218f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 31 Oct 2013 10:07:19 -0400
Subject: Btrfs: stop committing the transaction so much during relocate

I noticed with my horrible snapshot excercisor that we were taking forever to
relocate the larger the file system got.  This appeared to be because we were
committing the transaction _constantly_.  There were a few places where we do
braindead things with metadata reservation, like start a transaction and then
try to refill the block rsv, which not only keeps us from committing a
transaction during the enospc stuff, but keeps us from doing some of the harder
flushing work which will make us more likely to need to commit the transaction.
We also were checking the block rsv and committing the transaction if the block
rsv was below a certain threshold, but we were doing this in a place where we
don't actually keep anything in the block rsv so this was always ending up false
so we always committed the transaction in this case.  I tested this to make sure
it didn't break anything, but it takes about 10 hours to get the box to this
state so I don't know how much of an impact it will really make.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 729c91e151a3..92eb4832ff86 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2060,7 +2060,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	LIST_HEAD(inode_list);
 	struct btrfs_key key;
 	struct btrfs_key next_key;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_root *reloc_root;
 	struct btrfs_root_item *root_item;
 	struct btrfs_path *path;
@@ -2109,18 +2109,19 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	memset(&next_key, 0, sizeof(next_key));
 
 	while (1) {
-		trans = btrfs_start_transaction(root, 0);
-		BUG_ON(IS_ERR(trans));
-		trans->block_rsv = rc->block_rsv;
-
 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
 					     BTRFS_RESERVE_FLUSH_ALL);
 		if (ret) {
-			BUG_ON(ret != -EAGAIN);
-			ret = btrfs_commit_transaction(trans, root);
-			BUG_ON(ret);
-			continue;
+			err = ret;
+			goto out;
+		}
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
 		}
+		trans->block_rsv = rc->block_rsv;
 
 		replaced = 0;
 		max_level = level;
@@ -2166,6 +2167,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		root_item->drop_level = level;
 
 		btrfs_end_transaction_throttle(trans, root);
+		trans = NULL;
 
 		btrfs_btree_balance_dirty(root);
 
@@ -2194,7 +2196,8 @@ out:
 		btrfs_update_reloc_root(trans, root);
 	}
 
-	btrfs_end_transaction_throttle(trans, root);
+	if (trans)
+		btrfs_end_transaction_throttle(trans, root);
 
 	btrfs_btree_balance_dirty(root);
 
@@ -3994,16 +3997,6 @@ restart:
 			}
 		}
 
-		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
-		if (ret < 0) {
-			if (ret != -ENOSPC) {
-				err = ret;
-				WARN_ON(1);
-				break;
-			}
-			rc->commit_transaction = 1;
-		}
-
 		if (rc->commit_transaction) {
 			rc->commit_transaction = 0;
 			ret = btrfs_commit_transaction(trans, rc->extent_root);
-- 
cgit v1.3


From 9dced186f934648bb7796ad3301cfc2563e2ad6e Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 25 Oct 2013 17:33:36 +0800
Subject: Btrfs: fix the free space write out failure when there is no data
 space

After running space balance on a new fs, the fs check program outputed the
following warning message:
 free space inode generation (0) did not match free space cache generation (20)

Steps to reproduce:
 # mkfs.btrfs -f <dev>
 # mount <dev> <mnt>
 # btrfs balance start <mnt>
 # umount <mnt>
 # btrfs check <dev>

It was because there was no data space after the space balance, and the free
space write out task didn't try to allocate a new data chunk for the free space
inode when doing the reservation. So the data space reservation failed, and in
order to tell the free space loader that this free space inode could not be
trusted, the generation of the free space inode wasn't updated. Then the check
program found this problem and outputed the above message.

But in fact, it is safe that we try to allocate a new data chunk when we find
the data space is not enough. The patch fixes the above problem by this way.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4595a65db729..3aa52701d04a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3621,10 +3621,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	/* make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, root->sectorsize);
 
-	if (root == root->fs_info->tree_root ||
-	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
-		alloc_chunk = 0;
+	if (btrfs_is_free_space_inode(inode)) {
 		committed = 1;
+		ASSERT(current->journal_info);
 	}
 
 	data_sinfo = fs_info->data_sinfo;
@@ -3652,6 +3651,16 @@ again:
 			spin_unlock(&data_sinfo->lock);
 alloc:
 			alloc_target = btrfs_get_alloc_profile(root, 1);
+			/*
+			 * It is ugly that we don't call nolock join
+			 * transaction for the free space inode case here.
+			 * But it is safe because we only do the data space
+			 * reservation for the free space cache in the
+			 * transaction context, the common join transaction
+			 * just increase the counter of the current transaction
+			 * handler, doesn't try to acquire the trans_lock of
+			 * the fs.
+			 */
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
-- 
cgit v1.3


From 7fdf4b608dda5eea114cb23623b52e34dd5972f5 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Fri, 25 Oct 2013 18:52:08 +0800
Subject: Btrfs: use 'u64' rather than 'int' to get extent's generation

We define a 'int' to get extent's generation by mistake,fix it.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 92eb4832ff86..a5f6a8061b21 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3263,7 +3263,7 @@ static int add_tree_block(struct reloc_control *rc,
 	struct rb_node *rb_node;
 	u32 item_size;
 	int level = -1;
-	int generation;
+	u64 generation;
 
 	eb =  path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-- 
cgit v1.3


From 9b011adfe14977fcda977234609d43ca52463a3d Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Fri, 25 Oct 2013 19:12:02 +0800
Subject: Btrfs: remove scrub_super_lock holding in btrfs_sync_log()

Originally, we introduced scrub_super_lock to synchronize
tree log code with scrubbing super.

However we can replace scrub_super_lock with device_list_mutex,
because writing super will hold this mutex, this will reduce an extra
lock holding when writing supers in sync log code.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.h    |  3 ---
 fs/btrfs/disk-io.c  |  1 -
 fs/btrfs/scrub.c    | 18 +++++-------------
 fs/btrfs/tree-log.c |  2 --
 fs/btrfs/volumes.h  |  4 +++-
 5 files changed, 8 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9f5e1cfb0209..34279665fd69 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1586,7 +1586,6 @@ struct btrfs_fs_info {
 	atomic_t scrubs_paused;
 	atomic_t scrub_cancel_req;
 	wait_queue_head_t scrub_pause_wait;
-	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
 	struct btrfs_workers scrub_wr_completion_workers;
@@ -3950,9 +3949,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		    u64 end, struct btrfs_scrub_progress *progress,
 		    int readonly, int is_dev_replace);
 void btrfs_scrub_pause(struct btrfs_root *root);
-void btrfs_scrub_pause_super(struct btrfs_root *root);
 void btrfs_scrub_continue(struct btrfs_root *root);
-void btrfs_scrub_continue_super(struct btrfs_root *root);
 int btrfs_scrub_cancel(struct btrfs_fs_info *info);
 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
 			   struct btrfs_device *dev);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62c4aba221bb..a3ad3a27944d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2250,7 +2250,6 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->scrubs_paused, 0);
 	atomic_set(&fs_info->scrub_cancel_req, 0);
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
-	init_rwsem(&fs_info->scrub_super_lock);
 	fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	fs_info->check_integrity_print_mask = 0;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f21e2df89bc2..12009b4279ad 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2932,13 +2932,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	atomic_inc(&fs_info->scrubs_running);
 	mutex_unlock(&fs_info->scrub_lock);
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	if (!is_dev_replace) {
-		down_read(&fs_info->scrub_super_lock);
+		/*
+		 * by holding device list mutex, we can
+		 * kick off writing super in log tree sync.
+		 */
 		ret = scrub_supers(sctx, dev);
-		up_read(&fs_info->scrub_super_lock);
 	}
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	if (!ret)
 		ret = scrub_enumerate_chunks(sctx, dev, start, end,
@@ -2988,16 +2990,6 @@ void btrfs_scrub_continue(struct btrfs_root *root)
 	wake_up(&fs_info->scrub_pause_wait);
 }
 
-void btrfs_scrub_pause_super(struct btrfs_root *root)
-{
-	down_write(&root->fs_info->scrub_super_lock);
-}
-
-void btrfs_scrub_continue_super(struct btrfs_root *root)
-{
-	up_write(&root->fs_info->scrub_super_lock);
-}
-
 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 {
 	mutex_lock(&fs_info->scrub_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f98002ed32b1..257cbae8c948 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2586,9 +2586,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * the running transaction open, so a full commit can't hop
 	 * in and cause problems either.
 	 */
-	btrfs_scrub_pause_super(root);
 	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
-	btrfs_scrub_continue_super(root);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_wake_log_root;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b72f540c8b29..49e0ff01143f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -132,7 +132,9 @@ struct btrfs_fs_devices {
 
 	/* all of the devices in the FS, protected by a mutex
 	 * so we can safely walk it to write out the supers without
-	 * worrying about add/remove by the multi-device code
+	 * worrying about add/remove by the multi-device code.
+	 * Scrubbing super can kick off supers writing by holding
+	 * this mutex lock.
 	 */
 	struct mutex device_list_mutex;
 	struct list_head devices;
-- 
cgit v1.3


From e46f5388cdde0f402f2dded6f1cad614536c6429 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 28 Oct 2013 16:28:30 +0000
Subject: Btrfs: fix verification of dir_item

We were ignoring the name component of the dir_item. Both the
name and data must fit within BTRFS_MAX_XATTR_SIZE(root).

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dir-item.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1c529db1697f..c031ea3fd70f 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -474,8 +474,10 @@ int verify_dir_item(struct btrfs_root *root,
 	}
 
 	/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
-	if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
-		printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
+	if ((btrfs_dir_data_len(leaf, dir_item) +
+	     btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
+		printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n",
+		       (unsigned)btrfs_dir_name_len(leaf, dir_item),
 		       (unsigned)btrfs_dir_data_len(leaf, dir_item));
 		return 1;
 	}
-- 
cgit v1.3


From 488111aa0e17102d42e0328299fd782dc6a4a051 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 28 Oct 2013 16:30:29 +0000
Subject: Btrfs: fix csum search offset/length calculation in log tree

We were setting the csums search offset and length to the right values if
the extent is compressed, but later on right before doing the csums lookup
we were overriding these two parameters regardless of compression being
set or not for the extent.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 257cbae8c948..18891c009641 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3421,11 +3421,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	if (skip_csum)
 		return 0;
 
-	if (em->compress_type) {
-		csum_offset = 0;
-		csum_len = block_len;
-	}
-
 	/*
 	 * First check and see if our csums are on our outstanding ordered
 	 * extents.
@@ -3509,8 +3504,13 @@ unlocked:
 	if (!mod_len || ret)
 		return ret;
 
-	csum_offset = mod_start - em->start;
-	csum_len = mod_len;
+	if (em->compress_type) {
+		csum_offset = 0;
+		csum_len = block_len;
+	} else {
+		csum_offset = mod_start - em->start;
+		csum_len = mod_len;
+	}
 
 	/* block start is already adjusted for the file extent offset. */
 	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
-- 
cgit v1.3


From 269d040ff2f340e1ed517dab8b66b2f133c83a77 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 28 Oct 2013 17:39:21 +0000
Subject: Btrfs: log recovery, don't unlink inode always on error

If we get any error while doing a dir index/item lookup in the
log tree, we were always unlinking the corresponding inode in
the subvolume. It makes sense to unlink only if the lookup failed
to find the dir index/item, which corresponds to NULL or -ENOENT,
and not when other errors happen (like a transient -ENOMEM or -EIO).

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 18891c009641..7927a5fa755c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1832,7 +1832,7 @@ again:
 						     dir_key->offset,
 						     name, name_len, 0);
 		}
-		if (IS_ERR_OR_NULL(log_di)) {
+		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
 			btrfs_dir_item_key_to_cpu(eb, di, &location);
 			btrfs_release_path(path);
 			btrfs_release_path(log_path);
@@ -1869,6 +1869,9 @@ again:
 				goto again;
 			ret = 0;
 			goto out;
+		} else if (IS_ERR(log_di)) {
+			kfree(name);
+			return PTR_ERR(log_di);
 		}
 		btrfs_release_path(log_path);
 		kfree(name);
-- 
cgit v1.3


From 6f519564d7d978c00351d9ab6abac3deeac31621 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 29 Oct 2013 10:45:05 +0800
Subject: Btrfs: do not run snapshot-aware defragment on error

If something wrong happens in write endio, running snapshot-aware defragment
can end up with undefined results, maybe a crash, so we should avoid it.

In order to share similar code, this also adds a helper to free the struct for
snapshot-aware defrag.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c84adde53023..17800221978f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2365,10 +2365,23 @@ out_unlock:
 	return ret;
 }
 
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+	struct old_sa_defrag_extent *old, *tmp;
+
+	if (!new)
+		return;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+	kfree(new);
+}
+
 static void relink_file_extents(struct new_sa_defrag_extent *new)
 {
 	struct btrfs_path *path;
-	struct old_sa_defrag_extent *old, *tmp;
 	struct sa_defrag_extent_backref *backref;
 	struct sa_defrag_extent_backref *prev = NULL;
 	struct inode *inode;
@@ -2411,16 +2424,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
 	kfree(prev);
 
 	btrfs_free_path(path);
-
-	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		list_del(&old->list);
-		kfree(old);
-	}
 out:
+	free_sa_defrag_extent(new);
+
 	atomic_dec(&root->fs_info->defrag_running);
 	wake_up(&root->fs_info->transaction_wait);
-
-	kfree(new);
 }
 
 static struct new_sa_defrag_extent *
@@ -2430,7 +2438,7 @@ record_old_file_extents(struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct old_sa_defrag_extent *old, *tmp;
+	struct old_sa_defrag_extent *old;
 	struct new_sa_defrag_extent *new;
 	int ret;
 
@@ -2478,7 +2486,7 @@ record_old_file_extents(struct inode *inode,
 		if (slot >= btrfs_header_nritems(l)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
-				goto out_free_list;
+				goto out_free_path;
 			else if (ret > 0)
 				break;
 			continue;
@@ -2507,7 +2515,7 @@ record_old_file_extents(struct inode *inode,
 
 		old = kmalloc(sizeof(*old), GFP_NOFS);
 		if (!old)
-			goto out_free_list;
+			goto out_free_path;
 
 		offset = max(new->file_pos, key.offset);
 		end = min(new->file_pos + new->len, key.offset + num_bytes);
@@ -2529,15 +2537,10 @@ next:
 
 	return new;
 
-out_free_list:
-	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		list_del(&old->list);
-		kfree(old);
-	}
 out_free_path:
 	btrfs_free_path(path);
 out_kfree:
-	kfree(new);
+	free_sa_defrag_extent(new);
 	return NULL;
 }
 
@@ -2708,8 +2711,14 @@ out:
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 
 	/* for snapshot-aware defrag */
-	if (new)
-		relink_file_extents(new);
+	if (new) {
+		if (ret) {
+			free_sa_defrag_extent(new);
+			atomic_dec(&root->fs_info->defrag_running);
+		} else {
+			relink_file_extents(new);
+		}
+	}
 
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
-- 
cgit v1.3


From 48ec47364b6d493f0a9cdc116977bf3f34e5c3ec Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 30 Oct 2013 13:25:24 +0800
Subject: Btrfs: fix a crash when running balance and defrag concurrently

Running balance and defrag concurrently can end up with a crash:

kernel BUG at fs/btrfs/relocation.c:4528!
RIP: 0010:[<ffffffffa01ac33b>]  [<ffffffffa01ac33b>] btrfs_reloc_cow_block+ 0x1eb/0x230 [btrfs]
Call Trace:
  [<ffffffffa01398c1>] ? update_ref_for_cow+0x241/0x380 [btrfs]
  [<ffffffffa0180bad>] ? copy_extent_buffer+0xad/0x110 [btrfs]
  [<ffffffffa0139da1>] __btrfs_cow_block+0x3a1/0x520 [btrfs]
  [<ffffffffa013a0b6>] btrfs_cow_block+0x116/0x1b0 [btrfs]
  [<ffffffffa013ddad>] btrfs_search_slot+0x43d/0x970 [btrfs]
  [<ffffffffa0153c57>] btrfs_lookup_file_extent+0x37/0x40 [btrfs]
  [<ffffffffa0172a5e>] __btrfs_drop_extents+0x11e/0xae0 [btrfs]
  [<ffffffffa013b3fd>] ? generic_bin_search.constprop.39+0x8d/0x1a0 [btrfs]
  [<ffffffff8117d14a>] ? kmem_cache_alloc+0x1da/0x200
  [<ffffffffa0138e7a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs]
  [<ffffffffa0173ef0>] btrfs_drop_extents+0x60/0x90 [btrfs]
  [<ffffffffa016b24d>] relink_extent_backref+0x2ed/0x780 [btrfs]
  [<ffffffffa0162fe0>] ? btrfs_submit_bio_hook+0x1e0/0x1e0 [btrfs]
  [<ffffffffa01b8ed7>] ? iterate_inodes_from_logical+0x87/0xa0 [btrfs]
  [<ffffffffa016b909>] btrfs_finish_ordered_io+0x229/0xac0 [btrfs]
  [<ffffffffa016c3b5>] finish_ordered_fn+0x15/0x20 [btrfs]
  [<ffffffffa018cbe5>] worker_loop+0x125/0x4e0 [btrfs]
  [<ffffffffa018cac0>] ? btrfs_queue_worker+0x300/0x300 [btrfs]
  [<ffffffff81075ea0>] kthread+0xc0/0xd0
  [<ffffffff81075de0>] ? insert_kthread_work+0x40/0x40
  [<ffffffff8164796c>] ret_from_fork+0x7c/0xb0
  [<ffffffff81075de0>] ? insert_kthread_work+0x40/0x40
----------------------------------------------------------------------

It turns out to be that balance operation will bump root's @last_snapshot,
which enables snapshot-aware defrag path, and backref walking stuff will
find data reloc tree as refs' parent, and hit the BUG_ON() during COW.

As data reloc tree's data is just for relocation purpose, and will be deleted right
after relocation is done, it's unnecessary to walk those refs belonged to data reloc
tree, it'd be better to skip them.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/backref.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 721936a5b2ca..30d24cfb45d4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -185,6 +185,9 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 {
 	struct __prelim_ref *ref;
 
+	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		return 0;
+
 	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
 	if (!ref)
 		return -ENOMEM;
-- 
cgit v1.3


From 2e9f5954978cd5b0c26e6eeb9fd4ccbdce0f5ecc Mon Sep 17 00:00:00 2001
From: Rashika <rashika.kheria@gmail.com>
Date: Thu, 31 Oct 2013 02:45:20 +0530
Subject: btrfs: Add helper function for free_root_pointers()

The function free_root_pointers() in disk-io.h contains redundant code.
Therefore, this patch adds a helper function free_root_extent_buffers()
to free_root_pointers() to eliminate redundancy.

Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 60 +++++++++++++++++-------------------------------------
 1 file changed, 19 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a3ad3a27944d..22443fac9408 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2033,50 +2033,28 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
 }
 
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+	if (root) {
+		free_extent_buffer(root->node);
+		free_extent_buffer(root->commit_root);
+		root->node = NULL;
+		root->commit_root = NULL;
+	}
+}
+
 /* helper to cleanup tree roots */
 static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 {
-	free_extent_buffer(info->tree_root->node);
-	free_extent_buffer(info->tree_root->commit_root);
-	info->tree_root->node = NULL;
-	info->tree_root->commit_root = NULL;
-
-	if (info->dev_root) {
-		free_extent_buffer(info->dev_root->node);
-		free_extent_buffer(info->dev_root->commit_root);
-		info->dev_root->node = NULL;
-		info->dev_root->commit_root = NULL;
-	}
-	if (info->extent_root) {
-		free_extent_buffer(info->extent_root->node);
-		free_extent_buffer(info->extent_root->commit_root);
-		info->extent_root->node = NULL;
-		info->extent_root->commit_root = NULL;
-	}
-	if (info->csum_root) {
-		free_extent_buffer(info->csum_root->node);
-		free_extent_buffer(info->csum_root->commit_root);
-		info->csum_root->node = NULL;
-		info->csum_root->commit_root = NULL;
-	}
-	if (info->quota_root) {
-		free_extent_buffer(info->quota_root->node);
-		free_extent_buffer(info->quota_root->commit_root);
-		info->quota_root->node = NULL;
-		info->quota_root->commit_root = NULL;
-	}
-	if (info->uuid_root) {
-		free_extent_buffer(info->uuid_root->node);
-		free_extent_buffer(info->uuid_root->commit_root);
-		info->uuid_root->node = NULL;
-		info->uuid_root->commit_root = NULL;
-	}
-	if (chunk_root) {
-		free_extent_buffer(info->chunk_root->node);
-		free_extent_buffer(info->chunk_root->commit_root);
-		info->chunk_root->node = NULL;
-		info->chunk_root->commit_root = NULL;
-	}
+	free_root_extent_buffers(info->tree_root);
+
+	free_root_extent_buffers(info->dev_root);
+	free_root_extent_buffers(info->extent_root);
+	free_root_extent_buffers(info->csum_root);
+	free_root_extent_buffers(info->quota_root);
+	free_root_extent_buffers(info->uuid_root);
+	if (chunk_root)
+		free_root_extent_buffers(info->chunk_root);
 }
 
 static void del_fs_roots(struct btrfs_fs_info *fs_info)
-- 
cgit v1.3


From 95e94d14b42c1992493ce72492351d4601b274fe Mon Sep 17 00:00:00 2001
From: Rashika <rashika.kheria@gmail.com>
Date: Thu, 31 Oct 2013 03:12:42 +0530
Subject: btrfs: Replace multiple atomic_inc() with atomic_add()

This patch replaces multiple atomic_inc() with atomic_add() in
delayed-inode.c to reduce source code and have few instructions
for compilation.

Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/delayed-inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index df1a496bc8d4..af2ecca8a109 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -108,8 +108,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
 			return node;
 		}
 		btrfs_inode->delayed_node = node;
-		atomic_inc(&node->refs);	/* can be accessed */
-		atomic_inc(&node->refs);	/* cached in the inode */
+		/* can be accessed and cached in the inode */
+		atomic_add(2, &node->refs);
 		spin_unlock(&root->inode_lock);
 		return node;
 	}
@@ -138,8 +138,8 @@ again:
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);
 
-	atomic_inc(&node->refs);	/* cached in the btrfs inode */
-	atomic_inc(&node->refs);	/* can be accessed */
+	/* cached in the btrfs inode and can be accessed */
+	atomic_add(2, &node->refs);
 
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret) {
-- 
cgit v1.3


From 3c45bfc1528ccb8b94c06d5740854bb75f8a92fb Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 09:57:33 +0530
Subject: btrfs: Pack struct btrfs_device

Pack the structure btrfs_device in volumes.h to eliminate holes detected
by pahole, thus reducing binary memory footprint.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 49e0ff01143f..8b3cd142b373 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -43,9 +43,8 @@ struct btrfs_device {
 	/* WRITE_SYNC bios */
 	struct btrfs_pending_bios pending_sync_bios;
 
-	int running_pending;
 	u64 generation;
-
+	int running_pending;
 	int writeable;
 	int in_fs_metadata;
 	int missing;
@@ -53,11 +52,11 @@ struct btrfs_device {
 	int is_tgtdev_for_dev_replace;
 
 	spinlock_t io_lock;
+	/* the mode sent to blkdev_get */
+	fmode_t mode;
 
 	struct block_device *bdev;
 
-	/* the mode sent to blkdev_get */
-	fmode_t mode;
 
 	struct rcu_string *name;
 
@@ -78,16 +77,21 @@ struct btrfs_device {
 
 	/* optimal io width for this device */
 	u32 io_width;
+	/* type and info about this device */
+	u64 type;
 
 	/* minimal io size for this device */
 	u32 sector_size;
 
-	/* type and info about this device */
-	u64 type;
 
 	/* physical drive uuid (or lvm uuid) */
 	u8 uuid[BTRFS_UUID_SIZE];
 
+	/* for sending down flush barriers */
+	int nobarriers;
+	struct bio *flush_bio;
+	struct completion flush_wait;
+
 	/* per-device scrub information */
 	struct scrub_ctx *scrub_device;
 
@@ -103,10 +107,6 @@ struct btrfs_device {
 	struct radix_tree_root reada_zones;
 	struct radix_tree_root reada_extents;
 
-	/* for sending down flush barriers */
-	struct bio *flush_bio;
-	struct completion flush_wait;
-	int nobarriers;
 
 	/* disk I/O failure stats. For detailed description refer to
 	 * enum btrfs_dev_stat_values in ioctl.h */
-- 
cgit v1.3


From b19e684393752ec129a36924e6c270daf79676d6 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:29:38 +0530
Subject: btrfs: Remove redundant local zero structure

Remove redundant local zero structure, replacing it by the kernel's
global ZERO_PAGE.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9ea05506cb84..78d20482b72a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -368,9 +368,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 
 int btrfs_is_empty_uuid(u8 *uuid)
 {
-	static char empty_uuid[BTRFS_UUID_SIZE] = {0};
-
-	return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE);
+	BUILD_BUG_ON(BTRFS_UUID_SIZE > PAGE_SIZE);
+	return !memcmp(uuid, empty_zero_page, BTRFS_UUID_SIZE);
 }
 
 static noinline int create_subvol(struct inode *dir,
-- 
cgit v1.3


From fae7f21cece9a4c181a8d8131870c7247e153f65 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:30:08 +0530
Subject: btrfs: Use WARN_ON()'s return value in place of WARN_ON(1)

Use WARN_ON()'s return value in place of WARN_ON(1) for cleaner source
code that outputs a more descriptive warnings. Also fix the styling
warning of redundant braces that came up as a result of this fix.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/backref.c         |  3 +--
 fs/btrfs/check-integrity.c |  7 ++-----
 fs/btrfs/ctree.c           |  6 ++----
 fs/btrfs/delayed-inode.c   |  3 +--
 fs/btrfs/disk-io.c         |  8 +-------
 fs/btrfs/extent-tree.c     | 11 ++++-------
 fs/btrfs/extent_io.c       |  7 ++-----
 fs/btrfs/file-item.c       |  4 +---
 fs/btrfs/inode-map.c       |  4 +---
 fs/btrfs/inode.c           | 10 +++-------
 fs/btrfs/relocation.c      | 13 ++++---------
 fs/btrfs/send.c            |  4 +---
 fs/btrfs/tree-log.c        |  7 ++-----
 fs/btrfs/volumes.c         | 11 +++--------
 14 files changed, 28 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 30d24cfb45d4..3775947429b2 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -326,8 +326,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 
 	eb = path->nodes[level];
 	while (!eb) {
-		if (!level) {
-			WARN_ON(1);
+		if (WARN_ON(!level)) {
 			ret = 1;
 			goto out;
 		}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 656b07637e39..e0aab4456974 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2464,10 +2464,8 @@ static int btrfsic_process_written_superblock(
 		}
 	}
 
-	if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
-		WARN_ON(1);
+	if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
 		btrfsic_dump_tree(state);
-	}
 
 	return 0;
 }
@@ -2907,7 +2905,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 		btrfsic_release_block_ctx(&block_ctx);
 	}
 
-	if (!match) {
+	if (WARN_ON(!match)) {
 		printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
 		       " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
 		       " phys_bytenr=%llu)!\n",
@@ -2924,7 +2922,6 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 			       bytenr, block_ctx.dev->name,
 			       block_ctx.dev_bytenr, mirror_num);
 		}
-		WARN_ON(1);
 	}
 }
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a749121203df..03c606cefd50 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1285,11 +1285,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		free_extent_buffer(eb_root);
 		blocksize = btrfs_level_size(root, old_root->level);
 		old = read_tree_block(root, logical, blocksize, 0);
-		if (!old || !extent_buffer_uptodate(old)) {
+		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
 			free_extent_buffer(old);
 			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
 				logical);
-			WARN_ON(1);
 		} else {
 			eb = btrfs_clone_extent_buffer(old);
 			free_extent_buffer(old);
@@ -3643,8 +3642,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		ret = 1;
 		goto out;
 	}
-	if (!empty && push_items == btrfs_header_nritems(right))
-		WARN_ON(1);
+	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
 
 	/* push data from right to left */
 	copy_extent_buffer(left, right,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index af2ecca8a109..2bb8e1a988a1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -649,14 +649,13 @@ static int btrfs_delayed_inode_reserve_metadata(
 			goto out;
 
 		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-		if (!ret)
+		if (!WARN_ON(ret))
 			goto out;
 
 		/*
 		 * Ok this is a problem, let's just steal from the global rsv
 		 * since this really shouldn't happen that often.
 		 */
-		WARN_ON(1);
 		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
 					      dst_rsv, num_bytes);
 		goto out;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22443fac9408..77fa2d99252f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -475,14 +475,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	if (page != eb->pages[0])
 		return 0;
 	found_start = btrfs_header_bytenr(eb);
-	if (found_start != start) {
-		WARN_ON(1);
+	if (WARN_ON(found_start != start || !PageUptodate(page)))
 		return 0;
-	}
-	if (!PageUptodate(page)) {
-		WARN_ON(1);
-		return 0;
-	}
 	csum_tree_block(root, eb, 0);
 	return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3aa52701d04a..2d58461bd035 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1550,9 +1550,8 @@ again:
 	if (ret && !insert) {
 		err = -ENOENT;
 		goto out;
-	} else if (ret) {
+	} else if (WARN_ON(ret)) {
 		err = -EIO;
-		WARN_ON(1);
 		goto out;
 	}
 
@@ -5752,9 +5751,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 			extent_slot = path->slots[0];
 		}
-	} else if (ret == -ENOENT) {
+	} else if (WARN_ON(ret == -ENOENT)) {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
-		WARN_ON(1);
 		btrfs_err(info,
 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
 			bytenr, parent, root_objectid, owner_objectid,
@@ -8317,10 +8315,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 					struct btrfs_space_info,
 					list);
 		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
-			if (space_info->bytes_pinned > 0 ||
+			if (WARN_ON(space_info->bytes_pinned > 0 ||
 			    space_info->bytes_reserved > 0 ||
-			    space_info->bytes_may_use > 0) {
-				WARN_ON(1);
+			    space_info->bytes_may_use > 0)) {
 				dump_space_info(space_info, 0, 0);
 			}
 		}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a93bab452183..fb782ed62426 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1739,10 +1739,8 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	u64 last = 0;
 	int found = 0;
 
-	if (search_end <= cur_start) {
-		WARN_ON(1);
+	if (WARN_ON(search_end <= cur_start))
 		return 0;
-	}
 
 	spin_lock(&tree->lock);
 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
@@ -3568,9 +3566,8 @@ retry:
 			 * but no sense in crashing the users box for something
 			 * we can survive anyway.
 			 */
-			if (!eb) {
+			if (WARN_ON(!eb)) {
 				spin_unlock(&mapping->private_lock);
-				WARN_ON(1);
 				continue;
 			}
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ae8a5139e0ee..6f3848860283 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -849,10 +849,8 @@ insert:
 	path->leave_spinning = 0;
 	if (ret < 0)
 		goto fail_unlock;
-	if (ret != 0) {
-		WARN_ON(1);
+	if (WARN_ON(ret != 0))
 		goto fail_unlock;
-	}
 	leaf = path->nodes[0];
 csum:
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ec08004a6a20..ab485e57b6fe 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -78,10 +78,8 @@ again:
 			    btrfs_transaction_in_commit(fs_info)) {
 				leaf = path->nodes[0];
 
-				if (btrfs_header_nritems(leaf) == 0) {
-					WARN_ON(1);
+				if (WARN_ON(btrfs_header_nritems(leaf) == 0))
 					break;
-				}
 
 				/*
 				 * Save the key so we can advances forward
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17800221978f..ddecc9c332f4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2039,10 +2039,8 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 		key.offset = offset;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0) {
-		WARN_ON(1);
+	if (WARN_ON(ret < 0))
 		return ret;
-	}
 	ret = 0;
 
 	while (1) {
@@ -3182,8 +3180,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
 		/* if we have links, this was a truncate, lets do that */
 		if (inode->i_nlink) {
-			if (!S_ISREG(inode->i_mode)) {
-				WARN_ON(1);
+			if (WARN_ON(!S_ISREG(inode->i_mode))) {
 				iput(inode);
 				continue;
 			}
@@ -8023,8 +8020,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (ret == -EEXIST) {
 			/* we shouldn't get
 			 * eexist without a new_inode */
-			if (!new_inode) {
-				WARN_ON(1);
+			if (WARN_ON(!new_inode)) {
 				return ret;
 			}
 		} else {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a5f6a8061b21..70eca79bae52 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1777,8 +1777,7 @@ again:
 			new_ptr_gen = 0;
 		}
 
-		if (new_bytenr > 0 && new_bytenr == old_bytenr) {
-			WARN_ON(1);
+		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
 			ret = level;
 			break;
 		}
@@ -3545,10 +3544,8 @@ static int find_data_references(struct reloc_control *rc,
 				err = ret;
 				goto out;
 			}
-			if (ret > 0) {
-				WARN_ON(1);
+			if (WARN_ON(ret > 0))
 				goto out;
-			}
 
 			leaf = path->nodes[0];
 			nritems = btrfs_header_nritems(leaf);
@@ -3568,11 +3565,9 @@ static int find_data_references(struct reloc_control *rc,
 		}
 
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid != ref_objectid ||
-		    key.type != BTRFS_EXTENT_DATA_KEY) {
-			WARN_ON(1);
+		if (WARN_ON(key.objectid != ref_objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY))
 			break;
-		}
 
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e26a3a62fd3f..ec2f02435287 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -564,10 +564,8 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
 {
 	struct btrfs_cmd_header *hdr;
 
-	if (!sctx->send_buf) {
-		WARN_ON(1);
+	if (WARN_ON(!sctx->send_buf))
 		return -EINVAL;
-	}
 
 	BUG_ON(sctx->send_size);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7927a5fa755c..744553c83fe2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2130,8 +2130,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
 
-		if (btrfs_header_level(cur) != *level)
-			WARN_ON(1);
+		WARN_ON(btrfs_header_level(cur) != *level);
 
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
@@ -2951,10 +2950,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
 	/* find the first key from this transaction again */
 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
-	if (ret != 0) {
-		WARN_ON(1);
+	if (WARN_ON(ret != 0))
 		goto done;
-	}
 
 	/*
 	 * we have a block from this transaction, log every item in it
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 716abee16a5b..3c16da51348f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2552,8 +2552,7 @@ again:
 		failed = 0;
 		retried = true;
 		goto again;
-	} else if (failed && retried) {
-		WARN_ON(1);
+	} else if (WARN_ON(failed && retried)) {
 		ret = -ENOSPC;
 	}
 error:
@@ -5402,10 +5401,8 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
 		.bi_rw = bio->bi_rw,
 	};
 
-	if (bio->bi_vcnt == 0) {
-		WARN_ON(1);
+	if (WARN_ON(bio->bi_vcnt == 0))
 		return 1;
-	}
 
 	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
 	if (bio_sectors(bio) > max_sectors)
@@ -5638,10 +5635,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	struct btrfs_device *dev;
 	u64 tmp;
 
-	if (!devid && !fs_info) {
-		WARN_ON(1);
+	if (WARN_ON(!devid && !fs_info))
 		return ERR_PTR(-EINVAL);
-	}
 
 	dev = __alloc_device();
 	if (IS_ERR(dev))
-- 
cgit v1.3


From e248e04e7798d234ddbeab6ee6acdec2476fe1a8 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:31:13 +0530
Subject: btrfs: Enclose macros with complex values within parenthesis

Enclose macros with complex values within parenthesis in accordance to
checkpatch.pl.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent_map.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 61adc44b7805..93fba716d7f8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,10 +3,10 @@
 
 #include <linux/rbtree.h>
 
-#define EXTENT_MAP_LAST_BYTE (u64)-4
-#define EXTENT_MAP_HOLE (u64)-3
-#define EXTENT_MAP_INLINE (u64)-2
-#define EXTENT_MAP_DELALLOC (u64)-1
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
-- 
cgit v1.3


From d9b0d9ba04cf99abff9125b688c03e154598a644 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:32:18 +0530
Subject: btrfs: Replace kmalloc with kmalloc_array

Replace kmalloc(size * nr, ) with kmalloc_array(nr, size), thus making
it easier to check is that the calculation doesn't wrap or return a smaller allocation

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/delayed-inode.c | 4 ++--
 fs/btrfs/ioctl.c         | 2 +-
 fs/btrfs/super.c         | 2 +-
 fs/btrfs/volumes.c       | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 2bb8e1a988a1..8d292fbae659 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -770,13 +770,13 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
 	 */
 	btrfs_set_path_blocking(path);
 
-	keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
+	keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
 	if (!keys) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
+	data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
 	if (!data_size) {
 		ret = -ENOMEM;
 		goto error;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 78d20482b72a..3712ef86ca82 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1198,7 +1198,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		ra = &file->f_ra;
 	}
 
-	pages = kmalloc(sizeof(struct page *) * max_cluster,
+	pages = kmalloc_array(max_cluster, sizeof(struct page *),
 			GFP_NOFS);
 	if (!pages) {
 		ret = -ENOMEM;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c9079db7441e..1de6d4d67431 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1470,7 +1470,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 	nr_devices = fs_info->fs_devices->open_devices;
 	BUG_ON(!nr_devices);
 
-	devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+	devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
 		return -ENOMEM;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3c16da51348f..0db637097862 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4901,7 +4901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			num_stripes = map->num_stripes;
 			max_errors = nr_parity_stripes(map);
 
-			raid_map = kmalloc(sizeof(u64) * num_stripes,
+			raid_map = kmalloc_array(num_stripes, sizeof(u64),
 					   GFP_NOFS);
 			if (!raid_map) {
 				ret = -ENOMEM;
-- 
cgit v1.3


From 678712545b62715a6c867471320ff5f60a521f3a Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:33:04 +0530
Subject: btrfs: Fix checkpatch.pl warning of spacing issues

Fix spacing issues detected via checkpatch.pl in accordance with the
kernel style guidelines.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/async-thread.c     |  2 +-
 fs/btrfs/compression.c      |  2 +-
 fs/btrfs/ctree.c            |  2 +-
 fs/btrfs/extent-tree.c      |  6 +++---
 fs/btrfs/extent_io.c        |  2 +-
 fs/btrfs/file.c             |  2 +-
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/inode.c            |  6 +++---
 fs/btrfs/ioctl.c            | 14 +++++++-------
 9 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 08cc08f037a6..8aec751fa464 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -262,7 +262,7 @@ static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
 	struct btrfs_work *work = NULL;
 	struct list_head *cur = NULL;
 
-	if(!list_empty(prio_head))
+	if (!list_empty(prio_head))
 		cur = prio_head->next;
 
 	smp_mb();
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index df019c7f58b2..1499b27b4186 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -359,7 +359,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
-	if(!bio) {
+	if (!bio) {
 		kfree(cb);
 		return -ENOMEM;
 	}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 03c606cefd50..316136bd6dd7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4018,7 +4018,7 @@ again:
 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
 					if (data_size && !tried_avoid_double)
 						goto push_for_double;
-					split = 2 ;
+					split = 2;
 				}
 			}
 		}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2d58461bd035..fb5c76795eda 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5017,7 +5017,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
 	if (to_reserve)
-		trace_btrfs_space_reservation(root->fs_info,"delalloc",
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
 					      btrfs_ino(inode), to_reserve, 1);
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
@@ -8022,7 +8022,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 
 	spin_lock(&sinfo->lock);
 
-	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 		if (!list_empty(&sinfo->block_groups[i]))
 			free_bytes += __btrfs_get_ro_block_group_free_space(
 						&sinfo->block_groups[i]);
@@ -8310,7 +8310,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 
 	release_global_block_rsv(info);
 
-	while(!list_empty(&info->space_info)) {
+	while (!list_empty(&info->space_info)) {
 		space_info = list_entry(info->space_info.next,
 					struct btrfs_space_info,
 					list);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fb782ed62426..856bc2b2192c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4034,7 +4034,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 	if (offset >= last)
 		return NULL;
 
-	while(1) {
+	while (1) {
 		len = last - offset;
 		if (len == 0)
 			break;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3a20a12513b2..82d0342763c5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -369,7 +369,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 	u64 root_objectid = 0;
 
 	atomic_inc(&fs_info->defrag_running);
-	while(1) {
+	while (1) {
 		/* Pause the auto defragger. */
 		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
 			     &fs_info->fs_state))
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d7c445c30a16..057be95b1e1e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2280,7 +2280,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 		goto out;
 
 	entry = rb_entry(node, struct btrfs_free_space, offset_index);
-	while(1) {
+	while (1) {
 		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
 			*max_extent_size = entry->bytes;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ddecc9c332f4..57fa19d69e90 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6262,7 +6262,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
 	/* adjust the range_start to make sure it doesn't
 	 * go backwards from the start they passed in
 	 */
-	range_start = max(start,range_start);
+	range_start = max(start, range_start);
 	found = found_end - range_start;
 
 	if (found > 0) {
@@ -7066,7 +7066,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			}
 		} else {
 			submit_len += bvec->bv_len;
-			nr_pages ++;
+			nr_pages++;
 			bvec++;
 		}
 	}
@@ -8367,7 +8367,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
-	u64 index = 0 ;
+	u64 index = 0;
 	int name_len;
 	int datasize;
 	unsigned long ptr;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3712ef86ca82..6523108d2984 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -686,7 +686,7 @@ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
  *     nfs_async_unlink().
  */
 
-static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 {
 	int error;
 
@@ -856,7 +856,7 @@ static int find_new_extents(struct btrfs_root *root,
 
 	path->keep_locks = 1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_forward(root, &min_key, path, newer_than);
 		if (ret != 0)
 			goto none;
@@ -1918,7 +1918,7 @@ static noinline int search_ioctl(struct inode *inode,
 
 	path->keep_locks = 1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
 		if (ret != 0) {
 			if (ret > 0)
@@ -2004,7 +2004,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	key.type = BTRFS_INODE_REF_KEY;
 	key.offset = (u64)-1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
@@ -2033,7 +2033,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		}
 
 		*(ptr + len) = '/';
-		read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+		read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
 
 		if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
 			break;
@@ -2044,7 +2044,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		dirid = key.objectid;
 	}
 	memmove(name, ptr, total_len);
-	name[total_len]='\0';
+	name[total_len] = '\0';
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -2130,7 +2130,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 
 	inode = dentry->d_inode;
 	dest = BTRFS_I(inode)->root;
-	if (!capable(CAP_SYS_ADMIN)){
+	if (!capable(CAP_SYS_ADMIN)) {
 		/*
 		 * Regular user.  Only allow this with a special mount
 		 * option, when the user has write+exec access to the
-- 
cgit v1.3


From f570e757b54ca6eceabc9b19a6342f14e836c196 Mon Sep 17 00:00:00 2001
From: Rashika <rashika.kheria@gmail.com>
Date: Thu, 31 Oct 2013 16:50:42 +0530
Subject: btrfs: Remove useless variable in write_ctree_super()

The function write_ctree_super() in disk-io.c uses variable ret to return
the result of function write_all_supers(). Since, this variable serves
no purpose, hence the patch removes it and returns the call of the
called function.

Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 77fa2d99252f..4c4ed0bb3da1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3440,10 +3440,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors)
 {
-	int ret;
-
-	ret = write_all_supers(root, max_mirrors);
-	return ret;
+	return write_all_supers(root, max_mirrors);
 }
 
 /* Drop a fs root from the radix tree and free it. */
-- 
cgit v1.3


From 007d31f755294b9db69c3d18e90d6edae9f1604d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 31 Oct 2013 16:49:02 -0400
Subject: Btrfs: check file extent type before anything else

I hit this problem with my no holes patch and it made me realize what the
problem was for bz 60834.  If the first item in the leaf is an inline extent and
we try to read anything starting from disk_bytenr onward we will read off the
end of the leaf.  So we need to check to see what it's type is, and if it's not
REG we can just break out.  This should fix this problem.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/send.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ec2f02435287..6837fe87f3a6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3902,16 +3902,16 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	while (key.offset < ekey->offset + left_len) {
 		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 		right_type = btrfs_file_extent_type(eb, ei);
-		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-		right_len = btrfs_file_extent_num_bytes(eb, ei);
-		right_offset = btrfs_file_extent_offset(eb, ei);
-		right_gen = btrfs_file_extent_generation(eb, ei);
-
 		if (right_type != BTRFS_FILE_EXTENT_REG) {
 			ret = 0;
 			goto out;
 		}
 
+		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+		right_len = btrfs_file_extent_num_bytes(eb, ei);
+		right_offset = btrfs_file_extent_offset(eb, ei);
+		right_gen = btrfs_file_extent_generation(eb, ei);
+
 		/*
 		 * Are we at extent 8? If yes, we know the extent is changed.
 		 * This may only happen on the first iteration.
-- 
cgit v1.3


From 3b7a016f44d51ba8425c244f4c607f93fa213fd2 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Sat, 12 Oct 2013 02:11:12 +0800
Subject: Btrfs: avoid unnecessary scrub workers allocation

We only allocate scrub workers if we pass all the necessary
checks, for example, there are no operation in progress.

Besides, move mutex lock protection outside of scrub_workers_get()
/scrub_workers_put().

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/scrub.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 12009b4279ad..2544805544f0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2784,7 +2784,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 {
 	int ret = 0;
 
-	mutex_lock(&fs_info->scrub_lock);
 	if (fs_info->scrub_workers_refcnt == 0) {
 		if (is_dev_replace)
 			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
@@ -2814,21 +2813,17 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
-	mutex_unlock(&fs_info->scrub_lock);
-
 	return ret;
 }
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
-	mutex_lock(&fs_info->scrub_lock);
 	if (--fs_info->scrub_workers_refcnt == 0) {
 		btrfs_stop_workers(&fs_info->scrub_workers);
 		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
 		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
 	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
-	mutex_unlock(&fs_info->scrub_lock);
 }
 
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -2889,23 +2884,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -EINVAL;
 	}
 
-	ret = scrub_workers_get(fs_info, is_dev_replace);
-	if (ret)
-		return ret;
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
 	if (!dev || (dev->missing && !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -ENODEV;
 	}
-	mutex_lock(&fs_info->scrub_lock);
 
+	mutex_lock(&fs_info->scrub_lock);
 	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -EIO;
 	}
 
@@ -2916,10 +2906,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		btrfs_dev_replace_unlock(&fs_info->dev_replace);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -EINPROGRESS;
 	}
 	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+	ret = scrub_workers_get(fs_info, is_dev_replace);
+	if (ret) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return ret;
+	}
+
 	sctx = scrub_setup_ctx(dev, is_dev_replace);
 	if (IS_ERR(sctx)) {
 		mutex_unlock(&fs_info->scrub_lock);
@@ -2957,10 +2954,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_device = NULL;
+	scrub_workers_put(fs_info);
 	mutex_unlock(&fs_info->scrub_lock);
 
 	scrub_free_ctx(sctx);
-	scrub_workers_put(fs_info);
 
 	return ret;
 }
-- 
cgit v1.3


From d3ee29e39653455b7d799f77872d7df8c4060613 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:20 +0800
Subject: Btrfs: remove unnecessary initialization and memory barrior in
 shrink_delalloc()

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fb5c76795eda..5e0e6bcf67b9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4034,15 +4034,14 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	u64 delalloc_bytes;
 	u64 max_reclaim;
 	long time_left;
-	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
-	int loops = 0;
+	unsigned long nr_pages;
+	int loops;
 	enum btrfs_reserve_flush_enum flush;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
-	smp_mb();
 	delalloc_bytes = percpu_counter_sum_positive(
 						&root->fs_info->delalloc_bytes);
 	if (delalloc_bytes == 0) {
@@ -4052,6 +4051,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		return;
 	}
 
+	loops = 0;
 	while (delalloc_bytes && loops < 3) {
 		max_reclaim = min(delalloc_bytes, to_reclaim);
 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
@@ -4082,7 +4082,6 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 			if (time_left)
 				break;
 		}
-		smp_mb();
 		delalloc_bytes = percpu_counter_sum_positive(
 						&root->fs_info->delalloc_bytes);
 	}
-- 
cgit v1.3


From 38c135af8e22baf2ef3ebf9b213e398997a90946 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:21 +0800
Subject: Btrfs: wait for the ordered extent only when we want

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5e0e6bcf67b9..2564cbfdf417 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4047,7 +4047,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
-		btrfs_wait_all_ordered_extents(root->fs_info);
+		if (wait_ordered)
+			btrfs_wait_all_ordered_extents(root->fs_info);
 		return;
 	}
 
-- 
cgit v1.3


From 18cd8ea6df32e72548a4b055cba54d1a898c81e5 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:22 +0800
Subject: Btrfs: pick up the code for the item number calculation in
 flush_space()

This patch picked up the code that was used to calculate the number of
the items for which we need reserve space, and we will use it in the next
patch.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2564cbfdf417..9183f9cbc121 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4022,6 +4022,18 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 	}
 }
 
+static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+{
+	u64 bytes;
+	int nr;
+
+	bytes = btrfs_calc_trans_metadata_size(root, 1);
+	nr = (int)div64_u64(to_reclaim, bytes);
+	if (!nr)
+		nr = 1;
+	return nr;
+}
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -4167,16 +4179,11 @@ static int flush_space(struct btrfs_root *root,
 	switch (state) {
 	case FLUSH_DELAYED_ITEMS_NR:
 	case FLUSH_DELAYED_ITEMS:
-		if (state == FLUSH_DELAYED_ITEMS_NR) {
-			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
-
-			nr = (int)div64_u64(num_bytes, bytes);
-			if (!nr)
-				nr = 1;
-			nr *= 2;
-		} else {
+		if (state == FLUSH_DELAYED_ITEMS_NR)
+			nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+		else
 			nr = -1;
-		}
+
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
-- 
cgit v1.3


From c61a16a701a12659e2933c5965afe8d45284146a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:23 +0800
Subject: Btrfs: fix the confusion between delalloc bytes and metadata bytes

In shrink_delalloc(), what we need reclaim is the metadata space, so
flushing pages by to_reclaim is not reasonable, it is very likely that
the pages we flush are not enough. And then we had to invoke the flush
function for several times, at the worst, we need call flush_space for
several times. It wasted time.

We improve this problem by converting the metadata space size we need
reserve to the delalloc bytes, By this way, we can flush the pages
by a reasonable number.

(Now we use a fixed number to do conversion, it is not flexible, maybe
 we can find a good way to improve it in the future.)

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9183f9cbc121..d8da538d01fb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4034,6 +4034,8 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
 	return nr;
 }
 
+#define EXTENT_SIZE_PER_ITEM	(256 * 1024)
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -4050,6 +4052,10 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	int loops;
 	enum btrfs_reserve_flush_enum flush;
 
+	/* Calc the number of the pages we need flush for space reservation */
+	to_reclaim = calc_reclaim_items_nr(root, to_reclaim);
+	to_reclaim *= EXTENT_SIZE_PER_ITEM;
+
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
-- 
cgit v1.3


From 9f3a074d108810139ad4af49a29d347a4cf41e9a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:24 +0800
Subject: Btrfs: don't wait for all the async delalloc when shrinking delalloc

It was very likely that there were lots of async delalloc pages in the
filesystem, if we waited until all the pages were flushed, we would be
blocked for a long time, and the performance would also drop down.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d8da538d01fb..83bffbea7d97 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4079,9 +4079,19 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		 * We need to wait for the async pages to actually start before
 		 * we do anything.
 		 */
-		wait_event(root->fs_info->async_submit_wait,
-			   !atomic_read(&root->fs_info->async_delalloc_pages));
+		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+		if (!max_reclaim)
+			goto skip_async;
+
+		if (max_reclaim <= nr_pages)
+			max_reclaim = 0;
+		else
+			max_reclaim -= nr_pages;
 
+		wait_event(root->fs_info->async_submit_wait,
+			   atomic_read(&root->fs_info->async_delalloc_pages) <=
+			   (int)max_reclaim);
+skip_async:
 		if (!trans)
 			flush = BTRFS_RESERVE_FLUSH_ALL;
 		else
-- 
cgit v1.3


From b02441999efcc6152b87cd58e7970bb7843f76cf Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:25 +0800
Subject: Btrfs: don't wait for the completion of all the ordered extents

It is very likely that there are lots of ordered extents in the filesytem,
if we wait for the completion of all of them when we want to reclaim some
space for the metadata space reservation, we would be blocked for a long
time. The performance would drop down suddenly for a long time.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dev-replace.c  |  4 ++--
 fs/btrfs/extent-tree.c  | 11 ++++++-----
 fs/btrfs/ioctl.c        |  2 +-
 fs/btrfs/ordered-data.c | 22 +++++++++++++++++-----
 fs/btrfs/ordered-data.h |  4 ++--
 fs/btrfs/relocation.c   |  2 +-
 fs/btrfs/super.c        |  2 +-
 fs/btrfs/transaction.c  |  2 +-
 8 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cb94310db072..3d2495e83e62 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -391,7 +391,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 	btrfs_dev_replace_unlock(dev_replace);
 
-	btrfs_wait_all_ordered_extents(root->fs_info);
+	btrfs_wait_ordered_roots(root->fs_info, -1);
 
 	/* force writing the updated state information to disk */
 	trans = btrfs_start_transaction(root, 0);
@@ -466,7 +466,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
 	}
-	btrfs_wait_all_ordered_extents(root->fs_info);
+	btrfs_wait_ordered_roots(root->fs_info, -1);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 83bffbea7d97..a21bbf83fdc2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4018,7 +4018,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 */
 		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
 		if (!current->journal_info)
-			btrfs_wait_all_ordered_extents(root->fs_info);
+			btrfs_wait_ordered_roots(root->fs_info, -1);
 	}
 }
 
@@ -4050,11 +4050,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	long time_left;
 	unsigned long nr_pages;
 	int loops;
+	int items;
 	enum btrfs_reserve_flush_enum flush;
 
 	/* Calc the number of the pages we need flush for space reservation */
-	to_reclaim = calc_reclaim_items_nr(root, to_reclaim);
-	to_reclaim *= EXTENT_SIZE_PER_ITEM;
+	items = calc_reclaim_items_nr(root, to_reclaim);
+	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4066,7 +4067,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		if (trans)
 			return;
 		if (wait_ordered)
-			btrfs_wait_all_ordered_extents(root->fs_info);
+			btrfs_wait_ordered_roots(root->fs_info, items);
 		return;
 	}
 
@@ -4105,7 +4106,7 @@ skip_async:
 
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_all_ordered_extents(root->fs_info);
+			btrfs_wait_ordered_roots(root->fs_info, items);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6523108d2984..d4f2861b0d9e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -572,7 +572,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (ret)
 		return ret;
 
-	btrfs_wait_ordered_extents(root);
+	btrfs_wait_ordered_extents(root, -1);
 
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8a5eff366596..25a8f3812f14 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -565,10 +565,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-void btrfs_wait_ordered_extents(struct btrfs_root *root)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 {
 	struct list_head splice, works;
 	struct btrfs_ordered_extent *ordered, *next;
+	int count = 0;
 
 	INIT_LIST_HEAD(&splice);
 	INIT_LIST_HEAD(&works);
@@ -576,7 +577,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
 	mutex_lock(&root->fs_info->ordered_operations_mutex);
 	spin_lock(&root->ordered_extent_lock);
 	list_splice_init(&root->ordered_extents, &splice);
-	while (!list_empty(&splice)) {
+	while (!list_empty(&splice) && nr) {
 		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
 					   root_extent_list);
 		list_move_tail(&ordered->root_extent_list,
@@ -591,7 +592,11 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
 
 		cond_resched();
 		spin_lock(&root->ordered_extent_lock);
+		if (nr != -1)
+			nr--;
+		count++;
 	}
+	list_splice_tail(&splice, &root->ordered_extents);
 	spin_unlock(&root->ordered_extent_lock);
 
 	list_for_each_entry_safe(ordered, next, &works, work_list) {
@@ -601,18 +606,21 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
 		cond_resched();
 	}
 	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return count;
 }
 
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_root *root;
 	struct list_head splice;
+	int done;
 
 	INIT_LIST_HEAD(&splice);
 
 	spin_lock(&fs_info->ordered_root_lock);
 	list_splice_init(&fs_info->ordered_roots, &splice);
-	while (!list_empty(&splice)) {
+	while (!list_empty(&splice) && nr) {
 		root = list_first_entry(&splice, struct btrfs_root,
 					ordered_root);
 		root = btrfs_grab_fs_root(root);
@@ -621,10 +629,14 @@ void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
 			       &fs_info->ordered_roots);
 		spin_unlock(&fs_info->ordered_root_lock);
 
-		btrfs_wait_ordered_extents(root);
+		done = btrfs_wait_ordered_extents(root, nr);
 		btrfs_put_fs_root(root);
 
 		spin_lock(&fs_info->ordered_root_lock);
+		if (nr != -1) {
+			nr -= done;
+			WARN_ON(nr < 0);
+		}
 	}
 	spin_unlock(&fs_info->ordered_root_lock);
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3982db1c1b36..9b0450f7ac20 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -195,8 +195,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root);
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
 void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 70eca79bae52..e1b3c2c73c44 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4227,7 +4227,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 		err = ret;
 		goto out;
 	}
-	btrfs_wait_all_ordered_extents(fs_info);
+	btrfs_wait_ordered_roots(fs_info, -1);
 
 	while (1) {
 		mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1de6d4d67431..2d8ac1bf0cf9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -920,7 +920,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_wait_all_ordered_extents(fs_info);
+	btrfs_wait_ordered_roots(fs_info, -1);
 
 	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 277fe812d047..32c100b8c563 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1636,7 +1636,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
 	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-		btrfs_wait_all_ordered_extents(fs_info);
+		btrfs_wait_ordered_roots(fs_info, -1);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v1.3


From 91aef86f3b8ab0685d930a5468254384513d1c97 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:26 +0800
Subject: Btrfs: rename btrfs_start_all_delalloc_inodes

rename the function -- btrfs_start_all_delalloc_inodes(), and make its
name be compatible to btrfs_wait_ordered_roots(), since they are always
used at the same place.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.h       | 3 +--
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/inode.c       | 3 +--
 fs/btrfs/ioctl.c       | 2 +-
 fs/btrfs/relocation.c  | 2 +-
 fs/btrfs/transaction.c | 2 +-
 7 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 34279665fd69..aea4433081dc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3680,8 +3680,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
-				    int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3d2495e83e62..342f9fd411e3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -461,7 +461,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+	ret = btrfs_start_delalloc_roots(root->fs_info, 0);
 	if (ret) {
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a21bbf83fdc2..45d98d01028f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4016,7 +4016,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 * the filesystem is readonly(all dirty pages are written to
 		 * the disk).
 		 */
-		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		btrfs_start_delalloc_roots(root->fs_info, 0);
 		if (!current->journal_info)
 			btrfs_wait_ordered_roots(root->fs_info, -1);
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 57fa19d69e90..da8d2f696ac5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8306,8 +8306,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return ret;
 }
 
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
-				    int delay_iput)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
 {
 	struct btrfs_root *root;
 	struct list_head splice;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d4f2861b0d9e..1d04b5559e61 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4542,7 +4542,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC: {
 		int ret;
 
-		ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		ret = btrfs_start_delalloc_roots(root->fs_info, 0);
 		if (ret)
 			return ret;
 		ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e1b3c2c73c44..ce459a7cb16d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4222,7 +4222,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
 	       rc->block_group->key.objectid, rc->block_group->flags);
 
-	ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
+	ret = btrfs_start_delalloc_roots(fs_info, 0);
 	if (ret < 0) {
 		err = ret;
 		goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 32c100b8c563..57c16b46afbd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1629,7 +1629,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
 	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-		return btrfs_start_all_delalloc_inodes(fs_info, 1);
+		return btrfs_start_delalloc_roots(fs_info, 1);
 	return 0;
 }
 
-- 
cgit v1.3


From 9206c561554c948111d3cf6fc563a0beaaf790b3 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <andreas.dilger@intel.com>
Date: Mon, 11 Nov 2013 22:38:12 -0500
Subject: ext4: return non-zero st_blocks for inline data

Return a non-zero st_blocks to userspace for statfs() and friends.
Some versions of tar will assume that files with st_blocks == 0
do not contain any data and will skip reading them entirely.

Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 43015fa69c3a..075763474118 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4693,6 +4693,15 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
 
+	/*
+	 * If there is inline data in the inode, the inode will normally not
+	 * have data blocks allocated (it may have an external xattr block).
+	 * Report at least one sector for such files, so tools like tar, rsync,
+	 * others doen't incorrectly think the file is completely sparse.
+	 */
+	if (unlikely(ext4_has_inline_data(inode)))
+		stat->blocks += (stat->size + 511) >> 9;
+
 	/*
 	 * We can't update i_blocks if the block allocation is delayed
 	 * otherwise in the case of system crash before the real block
@@ -4704,9 +4713,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	 * blocks for this file.
 	 */
 	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
-				EXT4_I(inode)->i_reserved_data_blocks);
-
-	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
+				   EXT4_I(inode)->i_reserved_data_blocks);
+	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
 	return 0;
 }
 
-- 
cgit v1.3


From 3f61c0cc706d5c0beee7af17ffeb706403cf513c Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@dilger.ca>
Date: Mon, 11 Nov 2013 22:40:40 -0500
Subject: ext4: add prototypes for macro-generated functions

It isn't very easy to find the declarations for the functions created
by EXT4_INODE_BIT_FNS() because the names are generated by macros:

    ext4_test_inode_flag, ext4_set_inode_flag, ext4_clear_inode_flag
    ext4_test_inode_state, ext4_set_inode_state, ext4_clear_inode_state

Add explicit declarations for these functions so that grep and tags
can find them.

Signed-off-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 65485ab7a889..2238e6097cfa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1402,7 +1402,18 @@ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
 	clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);		\
 }
 
+/* Add these declarations here only so that these functions can be
+ * found by name.  Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_flag(struct inode *inode, int bit);
+static inline void ext4_set_inode_flag(struct inode *inode, int bit);
+static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
 EXT4_INODE_BIT_FNS(flag, flags, 0)
+
+/* Add these declarations here only so that these functions can be
+ * found by name.  Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_state(struct inode *inode, int bit);
+static inline void ext4_set_inode_state(struct inode *inode, int bit);
+static inline void ext4_clear_inode_state(struct inode *inode, int bit);
 #if (BITS_PER_LONG < 64)
 EXT4_INODE_BIT_FNS(state, state_flags, 0)
 
-- 
cgit v1.3


From 06f9da6e826a0b459652b98a21541bca274bd440 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 12 Nov 2013 15:06:52 -0800
Subject: fs/ocfs2: remove unnecessary variable bits_wanted from
 ocfs2_calc_extend_credits

Code cleanup to remove unnecessary variable passed but never used
to ocfs2_calc_extend_credits.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c         |  3 +--
 fs/ocfs2/dir.c          |  4 ++--
 fs/ocfs2/file.c         |  3 +--
 fs/ocfs2/journal.h      |  3 +--
 fs/ocfs2/move_extents.c |  3 +--
 fs/ocfs2/refcounttree.c |  9 +++------
 fs/ocfs2/xattr.c        | 20 +++++++-------------
 7 files changed, 16 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f37d3c0e2053..c0d4797618ef 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1802,8 +1802,7 @@ try_again:
 			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
 
 		credits = ocfs2_calc_extend_credits(inode->i_sb,
-						    &di->id2.i_list,
-						    clusters_to_alloc);
+						    &di->id2.i_list);
 
 	}
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 30544ce8e9f7..93b079115edc 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3284,7 +3284,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		if (ocfs2_dir_resv_allowed(osb))
 			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
 
-		credits = ocfs2_calc_extend_credits(sb, el, 1);
+		credits = ocfs2_calc_extend_credits(sb, el);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -3716,7 +3716,7 @@ static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
 {
 	int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
 
-	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
 	credits += ocfs2_quota_trans_credits(osb->sb);
 	return credits;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d71903c6068b..67c037af301a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -603,8 +603,7 @@ restart_all:
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
-					    clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 0b479bab3671..9ff4e8cf9d97 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -524,8 +524,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
  * the result may be wrong.
  */
 static inline int ocfs2_calc_extend_credits(struct super_block *sb,
-					    struct ocfs2_extent_list *root_el,
-					    u32 bits_wanted)
+					    struct ocfs2_extent_list *root_el)
 {
 	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
 
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 3d3f3c83065c..445678d9163c 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -201,8 +201,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
 		}
 	}
 
-	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
-					      clusters_to_move + 2);
+	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
 
 	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
 	     extra_blocks, clusters_to_move, *credits);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index bf4dfc14bb2c..0022192e7ac8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2502,8 +2502,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 		ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
 		*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
 		*credits += ocfs2_calc_extend_credits(sb,
-						      et.et_root_el,
-						      ref_blocks);
+						      et.et_root_el);
 	} else {
 		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
 		*meta_add += 1;
@@ -2874,8 +2873,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
 		meta_add =
 			ocfs2_extend_meta_needed(et->et_root_el);
 
-	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
-					      num_clusters + 2);
+	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
 
 	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
 					       p_cluster, num_clusters,
@@ -3625,8 +3623,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
 
 		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
 		*credits += ocfs2_calc_extend_credits(inode->i_sb,
-						      et.et_root_el,
-						      ref_blocks);
+						      et.et_root_el);
 	}
 
 out:
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6ce0686eab72..aaaab8ba82ab 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -754,8 +754,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 			BUG_ON(why == RESTART_META);
 
 			credits = ocfs2_calc_extend_credits(inode->i_sb,
-							    &vb->vb_xv->xr_list,
-							    clusters_to_add);
+							    &vb->vb_xv->xr_list);
 			status = ocfs2_extend_trans(handle, credits);
 			if (status < 0) {
 				status = -ENOMEM;
@@ -3040,8 +3039,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 			clusters_add += new_clusters;
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
-							&def_xv.xv.xr_list,
-							new_clusters);
+							&def_xv.xv.xr_list);
 		}
 
 		goto meta_guess;
@@ -3106,8 +3104,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 			if (!ocfs2_xattr_is_local(xe))
 				credits += ocfs2_calc_extend_credits(
 							inode->i_sb,
-							&def_xv.xv.xr_list,
-							new_clusters);
+							&def_xv.xv.xr_list);
 			goto out;
 		}
 	}
@@ -3132,9 +3129,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
 			clusters_add += new_clusters - old_clusters;
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
-							     &xv->xr_list,
-							     new_clusters -
-							     old_clusters);
+							     &xv->xr_list);
 			if (value_size >= OCFS2_XATTR_ROOT_SIZE)
 				goto out;
 		}
@@ -3180,7 +3175,7 @@ meta_guess:
 				 &xb->xb_attrs.xb_root.xt_list;
 			meta_add += ocfs2_extend_meta_needed(el);
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
-							     el, 1);
+							     el);
 		} else
 			credits += OCFS2_SUBALLOC_ALLOC + 1;
 
@@ -6216,8 +6211,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
 			  le16_to_cpu(xv->xr_list.l_next_free_rec);
 
 		*credits += ocfs2_calc_extend_credits(sb,
-						&def_xv.xv.xr_list,
-						le32_to_cpu(xv->xr_clusters));
+						&def_xv.xv.xr_list);
 
 		/*
 		 * If the value is a tree with depth > 1, We don't go deep
@@ -6782,7 +6776,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
 		metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
 
 	*credits += ocfs2_calc_extend_credits(osb->sb,
-					      xt_et->et_root_el, len);
+					      xt_et->et_root_el);
 
 	if (metas.num_metas) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
-- 
cgit v1.3


From f0cb0f0bca233935ac70707d47f5885419a7fd31 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Tue, 12 Nov 2013 15:06:53 -0800
Subject: fs/ocfs2/file.c: fix wrong comment

Unwritten extent only exists for file systems which support holes.  But
the comment said was opposite meaning and also the comment is not very
clear, so rephase it.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 67c037af301a..6fff128cad16 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -580,7 +580,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	int did_quota = 0;
 
 	/*
-	 * This function only exists for file systems which don't
+	 * Unwritten extent only exists for file systems which
 	 * support holes.
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
-- 
cgit v1.3


From 7391a294b861bf2c3b762dfdcf61b9c5f1bffa1f Mon Sep 17 00:00:00 2001
From: Rui Xiang <rui.xiang@huawei.com>
Date: Tue, 12 Nov 2013 15:06:54 -0800
Subject: ocfs2: return ENOMEM when sb_getblk() fails

The only reason for sb_getblk() failing is if it can't allocate the
buffer_head.  So return ENOMEM instead when it fails.

[joseph.qi@huawei.com: ocfs2_symlink_get_block() and ocfs2_read_blocks_sync() and ocfs2_read_blocks() need the same change]
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/alloc.c          | 2 +-
 fs/ocfs2/aops.c           | 2 ++
 fs/ocfs2/buffer_head_io.c | 4 ++--
 fs/ocfs2/dir.c            | 8 ++++----
 fs/ocfs2/namei.c          | 2 +-
 fs/ocfs2/refcounttree.c   | 6 +++---
 fs/ocfs2/suballoc.c       | 4 ++--
 fs/ocfs2/super.c          | 4 ++--
 fs/ocfs2/xattr.c          | 2 +-
 9 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 17e6bdde96c5..dc7411fe185d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1025,7 +1025,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 		for(i = count;  i < (num_got + count); i++) {
 			bhs[i] = sb_getblk(osb->sb, first_blkno);
 			if (bhs[i] == NULL) {
-				status = -EIO;
+				status = -ENOMEM;
 				mlog_errno(status);
 				goto bail;
 			}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c0d4797618ef..c20360002f29 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -80,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 						    le32_to_cpu(fe->i_clusters))) {
+		err = -ENOMEM;
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
 		     "%llu\n", (unsigned long long)iblock);
 		goto bail;
@@ -92,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 			    iblock;
 		buffer_cache_bh = sb_getblk(osb->sb, blkno);
 		if (!buffer_cache_bh) {
+			err = -ENOMEM;
 			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
 			goto bail;
 		}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5d18ad10c27f..5b704c63a103 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -115,7 +115,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 		if (bhs[i] == NULL) {
 			bhs[i] = sb_getblk(osb->sb, block++);
 			if (bhs[i] == NULL) {
-				status = -EIO;
+				status = -ENOMEM;
 				mlog_errno(status);
 				goto bail;
 			}
@@ -214,7 +214,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 			bhs[i] = sb_getblk(sb, block++);
 			if (bhs[i] == NULL) {
 				ocfs2_metadata_cache_io_unlock(ci);
-				status = -EIO;
+				status = -ENOMEM;
 				mlog_errno(status);
 				goto bail;
 			}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 93b079115edc..91a7e85ac8fd 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2349,7 +2349,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 
 	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
 	if (dx_root_bh == NULL) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		goto out;
 	}
 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
@@ -2422,7 +2422,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
 	for (i = 0; i < num_dx_leaves; i++) {
 		bh = sb_getblk(osb->sb, start_blk + i);
 		if (bh == NULL) {
-			ret = -EIO;
+			ret = -ENOMEM;
 			goto out;
 		}
 		dx_leaves[i] = bh;
@@ -2929,7 +2929,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 	dirdata_bh = sb_getblk(sb, blkno);
 	if (!dirdata_bh) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out_commit;
 	}
@@ -3159,7 +3159,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 
 	*new_bh = sb_getblk(sb, p_blkno);
 	if (!*new_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index be3f8676a438..4f791f6d27d0 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -489,7 +489,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 
 	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
 	if (!*new_fe_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto leave;
 	}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 0022192e7ac8..70d083e60d40 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1310,7 +1310,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 
 	new_bh = sb_getblk(sb, blkno);
 	if (new_bh == NULL) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1561,7 +1561,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
 
 	new_bh = sb_getblk(sb, blkno);
 	if (new_bh == NULL) {
-		ret = -EIO;
+		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
@@ -3029,7 +3029,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
 	for (i = 0; i < blocks; i++, old_block++, new_block++) {
 		new_bh = sb_getblk(osb->sb, new_block);
 		if (new_bh == NULL) {
-			ret = -EIO;
+			ret = -ENOMEM;
 			mlog_errno(ret);
 			break;
 		}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5397c07ce608..2c91452c4047 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -481,7 +481,7 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
@@ -661,7 +661,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
-		status = -EIO;
+		status = -ENOMEM;
 		mlog_errno(status);
 		goto bail;
 	}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d4e81e4a9b04..c41492957aa5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1848,8 +1848,8 @@ static int ocfs2_get_sector(struct super_block *sb,
 
 	*bh = sb_getblk(sb, block);
 	if (!*bh) {
-		mlog_errno(-EIO);
-		return -EIO;
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
 	}
 	lock_buffer(*bh);
 	if (!buffer_dirty(*bh))
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index aaaab8ba82ab..82257299d5ea 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -377,7 +377,7 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 		bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
 					      xb_blkno + i);
 		if (!bucket->bu_bhs[i]) {
-			rc = -EIO;
+			rc = -ENOMEM;
 			mlog_errno(rc);
 			break;
 		}
-- 
cgit v1.3


From 58796207cf09552706b0c3e0669f2ce52ea6cee1 Mon Sep 17 00:00:00 2001
From: Rui Xiang <rui.xiang@huawei.com>
Date: Tue, 12 Nov 2013 15:06:55 -0800
Subject: ocfs2: add necessary check in case sb_getblk() fails

sb_getblk() may return an err, so add a check for bh.

[joseph.qi@huawei.com: also add a check after calling sb_getblk() in ocfs2_create_xattr_block()]
Signed-off-by: Rui Xiang <rui.xiang@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/refcounttree.c | 5 +++++
 fs/ocfs2/xattr.c        | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 70d083e60d40..55767e1ba724 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -612,6 +612,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 	}
 
 	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	if (!new_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
 	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
 
 	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 82257299d5ea..f0a1326d9bba 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2864,6 +2864,12 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 	}
 
 	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	if (!new_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto end;
+	}
+
 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
 
 	ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
-- 
cgit v1.3


From 910bffe0861c956096c31ad7e887cc23169955ed Mon Sep 17 00:00:00 2001
From: Joel Becker <jlbec@evilplan.org>
Date: Tue, 12 Nov 2013 15:06:57 -0800
Subject: ocfs2: don't spam on -EDQUOT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

-EDQUOT is a user-visible error, not a logic problem.  Teach mlog_errno()
to ignore it like it ignores -ENOSPC, etc.

Signed-off-by: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: Marek Królikowski <admin@wset.edu.pl>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/masklog.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index baa2b9ef7eef..2260fb9e6508 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -199,7 +199,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog_errno(st) do {						\
 	int _st = (st);							\
 	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
-	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)		\
+	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC &&		\
+	    _st != -EDQUOT)						\
 		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
 } while (0)
 
-- 
cgit v1.3


From a8f70de37bbb991033208edff7758d997d68db37 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 12 Nov 2013 15:06:58 -0800
Subject: ocfs2: use bitmap_weight()

Use bitmap_weight() instead of reinventing the wheel.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 363f0dcc924f..292a13b05b6e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -35,6 +35,7 @@
 #include <linux/time.h>
 #include <linux/debugfs.h>
 #include <linux/slab.h>
+#include <linux/bitmap.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -282,15 +283,6 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
-static int o2hb_pop_count(void *map, int count)
-{
-	int i = -1, pop = 0;
-
-	while ((i = find_next_bit(map, count, i + 1)) < count)
-		pop++;
-	return pop;
-}
-
 static void o2hb_write_timeout(struct work_struct *work)
 {
 	int failed, quorum;
@@ -307,9 +299,9 @@ static void o2hb_write_timeout(struct work_struct *work)
 		spin_lock_irqsave(&o2hb_live_lock, flags);
 		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
 			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
-		failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+		failed = bitmap_weight(o2hb_failed_region_bitmap,
 					O2NM_MAX_REGIONS);
-		quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+		quorum = bitmap_weight(o2hb_quorum_region_bitmap,
 					O2NM_MAX_REGIONS);
 		spin_unlock_irqrestore(&o2hb_live_lock, flags);
 
@@ -765,7 +757,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
 	 * If global heartbeat active, unpin all regions if the
 	 * region count > CUT_OFF
 	 */
-	if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
 			   O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
 		o2hb_region_unpin(NULL);
 unlock:
@@ -1829,7 +1821,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	live_threshold = O2HB_LIVE_THRESHOLD;
 	if (o2hb_global_heartbeat_active()) {
 		spin_lock(&o2hb_live_lock);
-		if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+		if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
 			live_threshold <<= 1;
 		spin_unlock(&o2hb_live_lock);
 	}
@@ -2180,7 +2172,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 	if (!o2hb_dependent_users)
 		goto unlock;
 
-	if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
 			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
 		o2hb_region_pin(NULL);
 
@@ -2480,7 +2472,7 @@ static int o2hb_region_inc_user(const char *region_uuid)
 	if (o2hb_dependent_users > 1)
 		goto unlock;
 
-	if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+	if (bitmap_weight(o2hb_quorum_region_bitmap,
 			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
 		ret = o2hb_region_pin(NULL);
 
-- 
cgit v1.3


From 750e3c6581304c7dff813ac2317f5d733f84cb23 Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Tue, 12 Nov 2013 15:06:59 -0800
Subject: ocfs2: skip locks in the blocked list

A parallel umount on 4 nodes triggered a bug in
dlm_process_recovery_date().  Here's the situation:

Receiving MIG_LOCKRES message, A node processes the locks in migratable
lockres.  It copys lvb from migratable lockres when processing the first
valid lock.

If there is a lock in the blocked list with the EX level, it triggers the
BUG.  Since valid lvbs are set when locks are granted with EX or PR
levels, locks in the blocked list cannot have valid lvbs.  Therefore I
think we should skip the locks in the blocked list.

Signed-off-by: Xuejiufei <xuejiufei@huawei.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmrecovery.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0b5adca1b178..7035af09cc03 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1886,6 +1886,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		if (ml->type == LKM_NLMODE)
 			goto skip_lvb;
 
+		/*
+		 * If the lock is in the blocked list it can't have a valid lvb,
+		 * so skip it
+		 */
+		if (ml->list == DLM_BLOCKED_LIST)
+			goto skip_lvb;
+
 		if (!dlm_lvb_is_empty(mres->lvb)) {
 			if (lksb->flags & DLM_LKSB_PUT_LVB) {
 				/* other node was trying to update
-- 
cgit v1.3


From fae477b6f005d6bd8a6fb3f052b30d55cfa70c26 Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Tue, 12 Nov 2013 15:07:00 -0800
Subject: ocfs2: delay migration when the lockres is in migration state

We trigger a bug in __dlm_lockres_reserve_ast() when we parallel umount 4
nodes.  The situation is as follows:

1) Node A migrate all lockres it owned(eg.  lockres A) to other nodes
   say node B when it umounts.

2) Receiving MIG_LOCKRES message from A, Node B masters the lockres A
   with DLM_LOCK_RES_MIGRATING state set.

3) Then we umount ocfs2 on node B.  It also should migrate lockres A to
   another node, say node C.  But now, DLM_LOCK_RES_MIGRATING state of
   lockers A is not cleared.  Node B triggered the BUG on lockres with
   state DLM_LOCK_RES_MIGRATING.

Signed-off-by: Xuejiufei <xuejiufei@huawei.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Tariq Saeed <tariq.x.saeed@oracle.com>
Cc: Srinivas Eeda <srinivas.eeda@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmmaster.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index cf0f103963b1..5a0ef78fe029 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2354,6 +2354,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 
 	assert_spin_locked(&res->spinlock);
 
+	/* delay migration when the lockres is in MIGRATING state */
+	if (res->state & DLM_LOCK_RES_MIGRATING)
+		return 0;
+
 	if (res->owner != dlm->node_num)
 		return 0;
 
-- 
cgit v1.3


From 518df6bdf38559247ee90b9e59a40fae779d57d0 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 12 Nov 2013 15:07:01 -0800
Subject: ocfs2: use find_last_bit()

We already have find_last_bit().  So just use it as described in the
comment.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 292a13b05b6e..73920ffda05b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -946,23 +946,9 @@ out:
 	return changed;
 }
 
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
-			     int numbits)
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
 {
-	int highest, node;
-
-	highest = numbits;
-	node = -1;
-	while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
-		if (node >= numbits)
-			break;
-
-		highest = node;
-	}
-
-	return highest;
+	return find_last_bit(nodes, numbits);
 }
 
 static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
-- 
cgit v1.3


From 728b98059a2f19393f906ba4febc0f2d1d15f280 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Tue, 12 Nov 2013 15:07:02 -0800
Subject: ocfs2: break useless while loop

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmmaster.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 5a0ef78fe029..af3f7aa73e13 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1885,8 +1885,10 @@ ok:
 			 * up nodes that this node contacted */
 			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
 						    nn+1)) < O2NM_MAX_NODES) {
-				if (nn != dlm->node_num && nn != assert->node_idx)
+				if (nn != dlm->node_num && nn != assert->node_idx) {
 					master_request = 1;
+					break;
+				}
 			}
 		}
 		mle->master = assert->node_idx;
-- 
cgit v1.3


From eedd40e1cad4217c9b7e2577debcdd0c48732cc3 Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Tue, 12 Nov 2013 15:07:03 -0800
Subject: ocfs2: rollback transaction in ocfs2_group_add()

If ocfs2_journal_access_di() fails, group->bg_next_group should rollback.
Otherwise, there would be a inconsistency between group_bh and main_bm_bh.

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/resize.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ec55add7604a..376750f7883e 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -469,6 +469,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	struct ocfs2_chain_list *cl;
 	struct ocfs2_chain_rec *cr;
 	u16 cl_bpc;
+	u64 bg_ptr;
 
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
@@ -538,12 +539,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	}
 
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
+	bg_ptr = le64_to_cpu(group->bg_next_group);
 	group->bg_next_group = cr->c_blkno;
 	ocfs2_journal_dirty(handle, group_bh);
 
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
 				      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
+		group->bg_next_group = cpu_to_le64(bg_ptr);
 		mlog_errno(ret);
 		goto out_commit;
 	}
-- 
cgit v1.3


From 8abaae8d85d4e1de0fcc81b733ba04a5d318ba7a Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Tue, 12 Nov 2013 15:07:04 -0800
Subject: ocfs2: do not call brelse() if group_bh is not initialized in
 ocfs2_group_add()

If group_bh is not initialized, there is no need to release.  This
problem does not cause anything wrong, but the patch would make the code
more logical.

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/resize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 376750f7883e..822ebc10f281 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_unlock;
+		goto out_free_group_bh;
 	}
 
 	trace_ocfs2_group_add((unsigned long long)input->group,
@@ -524,7 +524,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	if (IS_ERR(handle)) {
 		mlog_errno(PTR_ERR(handle));
 		ret = -EINVAL;
-		goto out_unlock;
+		goto out_free_group_bh;
 	}
 
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
@@ -577,8 +577,11 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
-out_unlock:
+
+out_free_group_bh:
 	brelse(group_bh);
+
+out_unlock:
 	brelse(main_bm_bh);
 
 	ocfs2_inode_unlock(main_bm_inode, 1);
-- 
cgit v1.3


From bfbca926d67c9bf3212d1077c6afd48d2995a285 Mon Sep 17 00:00:00 2001
From: Younger Liu <younger.liu@huawei.com>
Date: Tue, 12 Nov 2013 15:07:05 -0800
Subject: ocfs2: add missing errno in ocfs2_ioctl_move_extents()

If the file is not regular or writeable, it should return errno(EPERM).

This patch is based on 85a258b70d ("ocfs2: fix error handling in
ocfs2_ioctl_move_extents()").

Signed-off-by: Younger Liu <younger.liu@huawei.com>
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/move_extents.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 445678d9163c..631a98213474 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1066,8 +1066,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
 	if (status)
 		return status;
 
-	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
+	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+		status = -EPERM;
 		goto out_drop;
+	}
 
 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
 		status = -EPERM;
@@ -1089,8 +1091,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
 		goto out_free;
 	}
 
-	if (range.me_start > i_size_read(inode))
+	if (range.me_start > i_size_read(inode)) {
+		status = -EINVAL;
 		goto out_free;
+	}
 
 	if (range.me_start + range.me_len > i_size_read(inode))
 			range.me_len = i_size_read(inode) - range.me_start;
-- 
cgit v1.3


From b1214e4757b7d5fcea483b927e130361d41430a5 Mon Sep 17 00:00:00 2001
From: Xue jiufei <xuejiufei@huawei.com>
Date: Tue, 12 Nov 2013 15:07:06 -0800
Subject: ocfs2: fix possible double free in ocfs2_write_begin_nolock

When ocfs2_write_cluster_by_desc() failed in ocfs2_write_begin_nolock()
because of ENOSPC, it goes to out_quota, freeing data_ac(meta_ac).  Then
it calls ocfs2_try_to_free_truncate_log() to free space.  If enough
space freed, it will try to write again.  Unfortunately, some error
happenes before ocfs2_lock_allocators(), it goes to out and free
data_ac(meta_ac) again.

Signed-off-by: joyce <xuejiufei@huawei.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c20360002f29..f959a1532767 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1898,10 +1898,14 @@ out_commit:
 out:
 	ocfs2_free_write_ctxt(wc);
 
-	if (data_ac)
+	if (data_ac) {
 		ocfs2_free_alloc_context(data_ac);
-	if (meta_ac)
+		data_ac = NULL;
+	}
+	if (meta_ac) {
 		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
 
 	if (ret == -ENOSPC && try_free) {
 		/*
-- 
cgit v1.3


From d00d2f8ab9f20f8e6a0fc3804847b418498d80b8 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 12 Nov 2013 15:07:07 -0800
Subject: ocfs2: convert use of typedef ctl_table to struct ctl_table

This typedef is unnecessary and should just be removed.

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/stackglue.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 39abf89697ed..cb7ec0b63ddc 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -643,7 +643,7 @@ error:
 
 #define FS_OCFS2_NM		1
 
-static ctl_table ocfs2_nm_table[] = {
+static struct ctl_table ocfs2_nm_table[] = {
 	{
 		.procname	= "hb_ctl_path",
 		.data		= ocfs2_hb_ctl_path,
@@ -654,7 +654,7 @@ static ctl_table ocfs2_nm_table[] = {
 	{ }
 };
 
-static ctl_table ocfs2_mod_table[] = {
+static struct ctl_table ocfs2_mod_table[] = {
 	{
 		.procname	= "nm",
 		.data		= NULL,
@@ -665,7 +665,7 @@ static ctl_table ocfs2_mod_table[] = {
 	{ }
 };
 
-static ctl_table ocfs2_kern_table[] = {
+static struct ctl_table ocfs2_kern_table[] = {
 	{
 		.procname	= "ocfs2",
 		.data		= NULL,
@@ -676,7 +676,7 @@ static ctl_table ocfs2_kern_table[] = {
 	{ }
 };
 
-static ctl_table ocfs2_root_table[] = {
+static struct ctl_table ocfs2_root_table[] = {
 	{
 		.procname	= "fs",
 		.data		= NULL,
-- 
cgit v1.3


From 41ecc345984bcc8bf341a3e758c1eb3fc543dd83 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Nov 2013 15:07:08 -0800
Subject: ocfs2: simplify ocfs2_invalidatepage() and ocfs2_releasepage()

Ocfs2 doesn't do data journalling.  Thus its ->invalidatepage and
->releasepage functions never get called on buffers that have journal
heads attached.  So just use standard variants of functions from
buffer.c.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/aops.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f959a1532767..aeb44e879c51 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -594,26 +594,11 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	ocfs2_rw_unlock(inode, level);
 }
 
-/*
- * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
- * from ext3.  PageChecked() bits have been removed as OCFS2 does not
- * do journalled data.
- */
-static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
-				 unsigned int length)
-{
-	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
-	jbd2_journal_invalidatepage(journal, page, offset, length);
-}
-
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
 {
-	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
 	if (!page_has_buffers(page))
 		return 0;
-	return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	return try_to_free_buffers(page);
 }
 
 static ssize_t ocfs2_direct_IO(int rw,
@@ -2092,7 +2077,7 @@ const struct address_space_operations ocfs2_aops = {
 	.write_end		= ocfs2_write_end,
 	.bmap			= ocfs2_bmap,
 	.direct_IO		= ocfs2_direct_IO,
-	.invalidatepage		= ocfs2_invalidatepage,
+	.invalidatepage		= block_invalidatepage,
 	.releasepage		= ocfs2_releasepage,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
-- 
cgit v1.3


From 83285c72e08c42848808039ef2d3b67a1bb88832 Mon Sep 17 00:00:00 2001
From: Xishi Qiu <qiuxishi@huawei.com>
Date: Tue, 12 Nov 2013 15:07:19 -0800
Subject: mm: use pgdat_end_pfn() to simplify the code in others

Use "pgdat_end_pfn()" instead of "pgdat->node_start_pfn +
pgdat->node_spanned_pages".  Simplify the code, no functional change.

Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c     | 3 +--
 mm/bootmem.c        | 2 +-
 mm/memory_hotplug.c | 9 ++++-----
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 06ea155e1a59..5ed0e52d6aa0 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -255,8 +255,7 @@ static int kcore_update_ram(void)
 	end_pfn = 0;
 	for_each_node_state(nid, N_MEMORY) {
 		unsigned long node_end;
-		node_end  = NODE_DATA(nid)->node_start_pfn +
-			NODE_DATA(nid)->node_spanned_pages;
+		node_end = node_end_pfn(nid);
 		if (end_pfn < node_end)
 			end_pfn = node_end;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..95b528cd4de7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
 	/* update goal according ...MAX_DMA32_PFN */
-	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	end_pfn = pgdat_end_pfn(pgdat);
 
 	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
 	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..375a42d76b2c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -365,8 +365,7 @@ out_fail:
 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 			    unsigned long end_pfn)
 {
-	unsigned long old_pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
 
 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
 		pgdat->node_start_pfn = start_pfn;
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 static void shrink_pgdat_span(struct pglist_data *pgdat,
 			      unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
-	unsigned long pgdat_end_pfn =
-		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+	unsigned long pgdat_end_pfn = p;
 	unsigned long pfn;
 	struct mem_section *ms;
 	int nid = pgdat->node_id;
-- 
cgit v1.3


From 948927ee9e4f35f287e61a79c9f0e85ca2202c7d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 12 Nov 2013 15:07:28 -0800
Subject: mm, mempolicy: make mpol_to_str robust and always succeed

mpol_to_str() should not fail.  Currently, it either fails because the
string buffer is too small or because a string hasn't been defined for a
mempolicy mode.

If a new mempolicy mode is introduced and no string is defined for it,
just warn and return "unknown".

If the buffer is too small, just truncate the string and return, the
same behavior as snprintf().

This also fixes a bug where there was no NULL-byte termination when doing
*p++ = '=' and *p++ ':' and maxlen has been reached.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Chen Gang <gang.chen@asianux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c        | 14 ++++++-------
 include/linux/mempolicy.h |  5 ++---
 mm/mempolicy.c            | 52 +++++++++++++++--------------------------------
 3 files changed, 24 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 390bdab01c3c..9f1369fe0afb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1387,8 +1387,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {};
 	struct mempolicy *pol;
-	int n;
-	char buffer[50];
+	char buffer[64];
+	int nid;
 
 	if (!mm)
 		return 0;
@@ -1404,10 +1404,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.mm = mm;
 
 	pol = get_vma_policy(task, vma, vma->vm_start);
-	n = mpol_to_str(buffer, sizeof(buffer), pol);
+	mpol_to_str(buffer, sizeof(buffer), pol);
 	mpol_cond_put(pol);
-	if (n < 0)
-		return n;
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
@@ -1460,9 +1458,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	if (md->writeback)
 		seq_printf(m, " writeback=%lu", md->writeback);
 
-	for_each_node_state(n, N_MEMORY)
-		if (md->node[n])
-			seq_printf(m, " N%d=%lu", n, md->node[n]);
+	for_each_node_state(nid, N_MEMORY)
+		if (md->node[nid])
+			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
 out:
 	seq_putc(m, '\n');
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ea4d2495c646..9fe426b30a41 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -169,7 +169,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 extern int mpol_parse_str(char *str, struct mempolicy **mpol);
 #endif
 
-extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
+extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 
 /* Check if a vma is migratable */
 static inline int vma_migratable(struct vm_area_struct *vma)
@@ -307,9 +307,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
 }
 #endif
 
-static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
-	return 0;
 }
 
 static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71cb253368cb..260b8213a873 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2914,62 +2914,45 @@ out:
  * @maxlen:  length of @buffer
  * @pol:  pointer to mempolicy to be formatted
  *
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+ * Convert @pol into a string.  If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
  */
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
 	char *p = buffer;
-	int l;
-	nodemask_t nodes;
-	unsigned short mode;
-	unsigned short flags = pol ? pol->flags : 0;
-
-	/*
-	 * Sanity check:  room for longest mode, flag and some nodes
-	 */
-	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+	nodemask_t nodes = NODE_MASK_NONE;
+	unsigned short mode = MPOL_DEFAULT;
+	unsigned short flags = 0;
 
-	if (!pol || pol == &default_policy)
-		mode = MPOL_DEFAULT;
-	else
+	if (pol && pol != &default_policy) {
 		mode = pol->mode;
+		flags = pol->flags;
+	}
 
 	switch (mode) {
 	case MPOL_DEFAULT:
-		nodes_clear(nodes);
 		break;
-
 	case MPOL_PREFERRED:
-		nodes_clear(nodes);
 		if (flags & MPOL_F_LOCAL)
 			mode = MPOL_LOCAL;
 		else
 			node_set(pol->v.preferred_node, nodes);
 		break;
-
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
-
 	default:
-		return -EINVAL;
+		WARN_ON_ONCE(1);
+		snprintf(p, maxlen, "unknown");
+		return;
 	}
 
-	l = strlen(policy_modes[mode]);
-	if (buffer + maxlen < p + l + 1)
-		return -ENOSPC;
-
-	strcpy(p, policy_modes[mode]);
-	p += l;
+	p += snprintf(p, maxlen, policy_modes[mode]);
 
 	if (flags & MPOL_MODE_FLAGS) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = '=';
+		p += snprintf(p, buffer + maxlen - p, "=");
 
 		/*
 		 * Currently, the only defined flags are mutually exclusive
@@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	}
 
 	if (!nodes_empty(nodes)) {
-		if (buffer + maxlen < p + 2)
-			return -ENOSPC;
-		*p++ = ':';
+		p += snprintf(p, buffer + maxlen - p, ":");
 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
 	}
-	return p - buffer;
 }
-- 
cgit v1.3


From ec8e41aec13005fed0dbee002fb8c99b4e001d50 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 12 Nov 2013 15:07:49 -0800
Subject: /proc/pid/smaps: show VM_SOFTDIRTY flag in VmFlags line

This flag shows that the VMA is "newly created" and thus represents
"dirty" in the task's VM.

You can clear it by "echo 4 > /proc/pid/clear_refs."

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 1 +
 fs/proc/task_mmu.c                 | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 823c95faebd2..22d89aa37218 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -460,6 +460,7 @@ manner. The codes are the following:
     nl  - non-linear mapping
     ar  - architecture specific flag
     dd  - do not include area into core dump
+    sd  - soft-dirty flag
     mm  - mixed map area
     hg  - huge page advise flag
     nh  - no-huge page advise flag
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9f1369fe0afb..abbe825d20ff 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -561,6 +561,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_NONLINEAR)]	= "nl",
 		[ilog2(VM_ARCH_1)]	= "ar",
 		[ilog2(VM_DONTDUMP)]	= "dd",
+#ifdef CONFIG_MEM_SOFT_DIRTY
+		[ilog2(VM_SOFTDIRTY)]	= "sd",
+#endif
 		[ilog2(VM_MIXEDMAP)]	= "mm",
 		[ilog2(VM_HUGEPAGE)]	= "hg",
 		[ilog2(VM_NOHUGEPAGE)]	= "nh",
-- 
cgit v1.3


From c4a391b53a72d2df4ee97f96f78c1d5971b47489 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Nov 2013 15:07:51 -0800
Subject: writeback: do not sync data dirtied after sync start

When there are processes heavily creating small files while sync(2) is
running, it can easily happen that quite some new files are created
between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2).  That can happen
especially if there are several busy filesystems (remember that sync
traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
fs before starting it on another fs).  Because WB_SYNC_ALL pass is slow
(e.g.  causes a transaction commit and cache flush for each inode in
ext3), resulting sync(2) times are rather large.

The following script reproduces the problem:

  function run_writers
  {
    for (( i = 0; i < 10; i++ )); do
      mkdir $1/dir$i
      for (( j = 0; j < 40000; j++ )); do
        dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
      done &
    done
  }

  for dir in "$@"; do
    run_writers $dir
  done

  sleep 40
  time sync

Fix the problem by disregarding inodes dirtied after sync(2) was called
in the WB_SYNC_ALL pass.  To allow for this, sync_inodes_sb() now takes
a time stamp when sync has started which is used for setting up work for
flusher threads.

To give some numbers, when above script is run on two ext4 filesystems
on simple SATA drive, the average sync time from 10 runs is 267.549
seconds with standard deviation 104.799426.  With the patched kernel,
the average sync time from 10 runs is 2.995 seconds with standard
deviation 0.096.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c                | 33 ++++++++++++++++++++++-----------
 fs/sync.c                        | 15 +++++++++------
 fs/xfs/xfs_super.c               |  2 +-
 include/linux/writeback.h        |  2 +-
 include/trace/events/writeback.h |  6 +++---
 5 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9f4935b8f208..4afdbd6d9678 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -39,13 +39,18 @@
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
-	unsigned long *older_than_this;
+	/*
+	 * Write only inodes dirtied before this time. Don't forget to set
+	 * older_than_this_is_set when you set this.
+	 */
+	unsigned long older_than_this;
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int older_than_this_is_set:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -246,10 +251,10 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 	int moved = 0;
 
+	WARN_ON_ONCE(!work->older_than_this_is_set);
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
-		if (work->older_than_this &&
-		    inode_dirtied_after(inode, *work->older_than_this))
+		if (inode_dirtied_after(inode, work->older_than_this))
 			break;
 		list_move(&inode->i_wb_list, &tmp);
 		moved++;
@@ -733,6 +738,8 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.older_than_this = jiffies,
+		.older_than_this_is_set = 1,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -791,12 +798,13 @@ static long wb_writeback(struct bdi_writeback *wb,
 {
 	unsigned long wb_start = jiffies;
 	long nr_pages = work->nr_pages;
-	unsigned long oldest_jif;
 	struct inode *inode;
 	long progress;
 
-	oldest_jif = jiffies;
-	work->older_than_this = &oldest_jif;
+	if (!work->older_than_this_is_set) {
+		work->older_than_this = jiffies;
+		work->older_than_this_is_set = 1;
+	}
 
 	spin_lock(&wb->list_lock);
 	for (;;) {
@@ -830,10 +838,10 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * safe.
 		 */
 		if (work->for_kupdate) {
-			oldest_jif = jiffies -
+			work->older_than_this = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
 		} else if (work->for_background)
-			oldest_jif = jiffies;
+			work->older_than_this = jiffies;
 
 		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
@@ -1345,18 +1353,21 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
 
 /**
  * sync_inodes_sb	-	sync sb inode pages
- * @sb: the superblock
+ * @sb:			the superblock
+ * @older_than_this:	timestamp
  *
  * This function writes and waits on any dirty inode belonging to this
- * super_block.
+ * superblock that has been dirtied before given timestamp.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
+		.older_than_this = older_than_this,
+		.older_than_this_is_set = 1,
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
diff --git a/fs/sync.c b/fs/sync.c
index 905f3f6b3d85..ff96f99fef64 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,10 +27,11 @@
  * wait == 1 case since in that case write_inode() functions do
  * sync_dirty_buffer() and thus effectively write one block at a time.
  */
-static int __sync_filesystem(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb, int wait,
+			     unsigned long start)
 {
 	if (wait)
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, start);
 	else
 		writeback_inodes_sb(sb, WB_REASON_SYNC);
 
@@ -47,6 +48,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 int sync_filesystem(struct super_block *sb)
 {
 	int ret;
+	unsigned long start = jiffies;
 
 	/*
 	 * We need to be protected against the filesystem going from
@@ -60,17 +62,17 @@ int sync_filesystem(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = __sync_filesystem(sb, 0);
+	ret = __sync_filesystem(sb, 0, start);
 	if (ret < 0)
 		return ret;
-	return __sync_filesystem(sb, 1);
+	return __sync_filesystem(sb, 1, start);
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
 	if (!(sb->s_flags & MS_RDONLY))
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, *((unsigned long *)arg));
 }
 
 static void sync_fs_one_sb(struct super_block *sb, void *arg)
@@ -102,9 +104,10 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 SYSCALL_DEFINE0(sync)
 {
 	int nowait = 0, wait = 1;
+	unsigned long start = jiffies;
 
 	wakeup_flusher_threads(0, WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
+	iterate_supers(sync_inodes_one_sb, &start);
 	iterate_supers(sync_fs_one_sb, &nowait);
 	iterate_supers(sync_fs_one_sb, &wait);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 15188cc99449..8968f5036fa1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,7 +918,7 @@ xfs_flush_inodes(
 	struct super_block	*sb = mp->m_super;
 
 	if (down_read_trylock(&sb->s_umount)) {
-		sync_inodes_sb(sb);
+		sync_inodes_sb(sb, jiffies);
 		up_read(&sb->s_umount);
 	}
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 021b8a319b9e..fc0e4320aa6d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 				  enum wb_reason reason);
-void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 464ea82e10db..c7bbbe794e65 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -287,11 +287,11 @@ TRACE_EVENT(writeback_queue_io,
 		__field(int,		reason)
 	),
 	TP_fast_assign(
-		unsigned long *older_than_this = work->older_than_this;
+		unsigned long older_than_this = work->older_than_this;
 		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
-		__entry->older	= older_than_this ?  *older_than_this : 0;
+		__entry->older	= older_than_this;
 		__entry->age	= older_than_this ?
-				  (jiffies - *older_than_this) * 1000 / HZ : -1;
+				  (jiffies - older_than_this) * 1000 / HZ : -1;
 		__entry->moved	= moved;
 		__entry->reason	= work->reason;
 	),
-- 
cgit v1.3


From 00619bcc44d6b779aa366130b354153c222e4380 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Tue, 12 Nov 2013 15:08:31 -0800
Subject: mm: factor commit limit calculation

The same calculation is currently done in three differents places.
Factor that code so future changes has to be made at only one place.

[akpm@linux-foundation.org: uninline vm_commit_limit()]
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c    |  5 +----
 include/linux/mman.h |  2 ++
 mm/mmap.c            |  4 +---
 mm/nommu.c           |  3 +--
 mm/util.c            | 13 +++++++++++++
 5 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 59d85d608898..c805d5b69ba1 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -24,7 +24,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 {
 	struct sysinfo i;
 	unsigned long committed;
-	unsigned long allowed;
 	struct vmalloc_info vmi;
 	long cached;
 	unsigned long pages[NR_LRU_LISTS];
@@ -37,8 +36,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	si_meminfo(&i);
 	si_swapinfo(&i);
 	committed = percpu_counter_read_positive(&vm_committed_as);
-	allowed = ((totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
 	cached = global_page_state(NR_FILE_PAGES) -
 			total_swapcache_pages() - i.bufferram;
@@ -147,7 +144,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(global_page_state(NR_UNSTABLE_NFS)),
 		K(global_page_state(NR_BOUNCE)),
 		K(global_page_state(NR_WRITEBACK_TEMP)),
-		K(allowed),
+		K(vm_commit_limit()),
 		K(committed),
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 92dc257251e4..7f7f8dae4b1d 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -87,4 +87,6 @@ calc_vm_flag_bits(unsigned long flags)
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }
+
+unsigned long vm_commit_limit(void);
 #endif /* _LINUX_MMAN_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 3d3e224be771..803048e9c568 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = (totalram_pages - hugetlb_total_pages())
-	       	* sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f158548e..d8a957bb9e31 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		goto error;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = vm_commit_limit();
 	/*
 	 * Reserve some 3% for root
 	 */
 	if (!cap_sys_admin)
 		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-	allowed += total_swap_pages;
 
 	/*
 	 * Don't let a single process grow so big a user can't recover
diff --git a/mm/util.c b/mm/util.c
index eaf63fc2c92f..f7bc2096071c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,9 @@
 #include <linux/security.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mman.h>
+#include <linux/hugetlb.h>
+
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page)
 	return mapping;
 }
 
+/*
+ * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
+ */
+unsigned long vm_commit_limit(void)
+{
+	return ((totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100) + total_swap_pages;
+}
+
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-- 
cgit v1.3


From 54886a7153353ea2bf21ebfc1b8e030e71d151d7 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Date: Tue, 12 Nov 2013 15:08:35 -0800
Subject: cramfs: mark as obsolete

Who needs cramfs when you have squashfs?  At least, we should warn people
that cramfs is obsolete.

Signed-off-by: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS       | 2 +-
 fs/cramfs/Kconfig | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/MAINTAINERS b/MAINTAINERS
index 051e4dc5b70f..f5867f469f13 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2373,7 +2373,7 @@ F:	kernel/cpuset.c
 
 CRAMFS FILESYSTEM
 W:	http://sourceforge.net/projects/cramfs/
-S:	Orphan
+S:	Orphan / Obsolete
 F:	Documentation/filesystems/cramfs.txt
 F:	fs/cramfs/
 
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index cd06466f365e..11b29d491b7c 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -1,5 +1,5 @@
 config CRAMFS
-	tristate "Compressed ROM file system support (cramfs)"
+	tristate "Compressed ROM file system support (cramfs) (OBSOLETE)"
 	depends on BLOCK
 	select ZLIB_INFLATE
 	help
@@ -16,4 +16,7 @@ config CRAMFS
 	  cramfs.  Note that the root file system (the one containing the
 	  directory /) cannot be compiled as a module.
 
+	  This filesystem is obsoleted by SquashFS, which is much better
+	  in terms of performance and features.
+
 	  If unsure, say N.
-- 
cgit v1.3


From 6bc080d8fdae33f4463203a400cfaa01e91701e2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 12 Nov 2013 15:10:04 -0800
Subject: debugfs: use list_next_entry() in debugfs_remove_recursive()

Change debugfs_remove_recursive() to use list_next_entry(child), no
changes in generated code.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eilon Greenstein <eilong@broadcom.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/debugfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c7c83ff0f752..9c0444cccbe1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -566,8 +566,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	mutex_lock(&parent->d_inode->i_mutex);
 
 	if (child != dentry) {
-		next = list_entry(child->d_u.d_child.next, struct dentry,
-					d_u.d_child);
+		next = list_next_entry(child, d_u.d_child);
 		goto up;
 	}
 
-- 
cgit v1.3


From ae10b2b4eb01bedc91d29d5c5bb9e416fd806c40 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Tue, 12 Nov 2013 15:10:16 -0800
Subject: epoll: optimize EPOLL_CTL_DEL using rcu

Nathan Zimmer found that once we get over 10+ cpus, the scalability of
SPECjbb falls over due to the contention on the global 'epmutex', which is
taken in on EPOLL_CTL_ADD and EPOLL_CTL_DEL operations.

Patch #1 removes the 'epmutex' lock completely from the EPOLL_CTL_DEL path
by using rcu to guard against any concurrent traversals.

Patch #2 remove the 'epmutex' lock from EPOLL_CTL_ADD operations for
simple topologies.  IE when adding a link from an epoll file descriptor to
a wakeup source, where the epoll file descriptor is not nested.

This patch (of 2):

Optimize EPOLL_CTL_DEL such that it does not require the 'epmutex' by
converting the file->f_ep_links list into an rcu one.  In this way, we can
traverse the epoll network on the add path in parallel with deletes.
Since deletes can't create loops or worse wakeup paths, this is safe.

This patch in combination with the patch "epoll: Do not take global 'epmutex'
for simple topologies", shows a dramatic performance improvement in
scalability for SPECjbb.

Signed-off-by: Jason Baron <jbaron@akamai.com>
Tested-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Eric Wong <normalperson@yhbt.net>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
CC: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 56 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 810c28fb8c3c..584249454822 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,6 +41,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/compat.h>
+#include <linux/rculist.h>
 
 /*
  * LOCKING:
@@ -133,8 +134,12 @@ struct nested_calls {
  * of these on a server and we do not want this to take another cache line.
  */
 struct epitem {
-	/* RB tree node used to link this structure to the eventpoll RB tree */
-	struct rb_node rbn;
+	union {
+		/* RB tree node links this structure to the eventpoll RB tree */
+		struct rb_node rbn;
+		/* Used to free the struct epitem */
+		struct rcu_head rcu;
+	};
 
 	/* List header used to link this structure to the eventpoll ready list */
 	struct list_head rdllink;
@@ -671,6 +676,12 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	return error;
 }
 
+static void epi_rcu_free(struct rcu_head *head)
+{
+	struct epitem *epi = container_of(head, struct epitem, rcu);
+	kmem_cache_free(epi_cache, epi);
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
@@ -692,8 +703,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
-	if (ep_is_linked(&epi->fllink))
-		list_del_init(&epi->fllink);
+	list_del_rcu(&epi->fllink);
 	spin_unlock(&file->f_lock);
 
 	rb_erase(&epi->rbn, &ep->rbr);
@@ -704,9 +714,14 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	spin_unlock_irqrestore(&ep->lock, flags);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
-
-	/* At this point it is safe to free the eventpoll item */
-	kmem_cache_free(epi_cache, epi);
+	/*
+	 * At this point it is safe to free the eventpoll item. Use the union
+	 * field epi->rcu, since we are trying to minimize the size of
+	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
+	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
+	 * use of the rbn field.
+	 */
+	call_rcu(&epi->rcu, epi_rcu_free);
 
 	atomic_long_dec(&ep->user->epoll_watches);
 
@@ -872,7 +887,6 @@ static const struct file_operations eventpoll_fops = {
  */
 void eventpoll_release_file(struct file *file)
 {
-	struct list_head *lsthead = &file->f_ep_links;
 	struct eventpoll *ep;
 	struct epitem *epi;
 
@@ -890,17 +904,12 @@ void eventpoll_release_file(struct file *file)
 	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
-
-	while (!list_empty(lsthead)) {
-		epi = list_first_entry(lsthead, struct epitem, fllink);
-
+	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
 		ep = epi->ep;
-		list_del_init(&epi->fllink);
 		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
 		mutex_unlock(&ep->mtx);
 	}
-
 	mutex_unlock(&epmutex);
 }
 
@@ -1138,7 +1147,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
 	struct file *child_file;
 	struct epitem *epi;
 
-	list_for_each_entry(epi, &file->f_ep_links, fllink) {
+	/* CTL_DEL can remove links here, but that can't increase our count */
+	rcu_read_lock();
+	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
 		child_file = epi->ep->file;
 		if (is_file_epoll(child_file)) {
 			if (list_empty(&child_file->f_ep_links)) {
@@ -1160,6 +1171,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
 				"file is not an ep!\n");
 		}
 	}
+	rcu_read_unlock();
 	return error;
 }
 
@@ -1286,7 +1298,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* Add the current item to the list of active epoll hook for this file */
 	spin_lock(&tfile->f_lock);
-	list_add_tail(&epi->fllink, &tfile->f_ep_links);
+	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
 	spin_unlock(&tfile->f_lock);
 
 	/*
@@ -1327,8 +1339,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 error_remove_epi:
 	spin_lock(&tfile->f_lock);
-	if (ep_is_linked(&epi->fllink))
-		list_del_init(&epi->fllink);
+	list_del_rcu(&epi->fllink);
 	spin_unlock(&tfile->f_lock);
 
 	rb_erase(&epi->rbn, &ep->rbr);
@@ -1844,15 +1855,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * and hang them on the tfile_check_list, so we can check that we
 	 * haven't created too many possible wakeup paths.
 	 *
-	 * We need to hold the epmutex across both ep_insert and ep_remove
-	 * b/c we want to make sure we are looking at a coherent view of
-	 * epoll network.
+	 * We need to hold the epmutex across ep_insert to prevent
+	 * multple adds from creating loops in parallel.
 	 */
-	if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
+	if (op == EPOLL_CTL_ADD) {
 		mutex_lock(&epmutex);
 		did_lock_epmutex = 1;
-	}
-	if (op == EPOLL_CTL_ADD) {
 		if (is_file_epoll(tf.file)) {
 			error = -ELOOP;
 			if (ep_loop_check(ep, tf.file) != 0) {
-- 
cgit v1.3


From 67347fe4e6326338ee217d7eb826bedf30b2e155 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Tue, 12 Nov 2013 15:10:18 -0800
Subject: epoll: do not take global 'epmutex' for simple topologies

When calling EPOLL_CTL_ADD for an epoll file descriptor that is attached
directly to a wakeup source, we do not need to take the global 'epmutex',
unless the epoll file descriptor is nested.  The purpose of taking the
'epmutex' on add is to prevent complex topologies such as loops and deep
wakeup paths from forming in parallel through multiple EPOLL_CTL_ADD
operations.  However, for the simple case of an epoll file descriptor
attached directly to a wakeup source (with no nesting), we do not need to
hold the 'epmutex'.

This patch along with 'epoll: optimize EPOLL_CTL_DEL using rcu' improves
scalability on larger systems.  Quoting Nathan Zimmer's mail on SPECjbb
performance:

"On the 16 socket run the performance went from 35k jOPS to 125k jOPS.  In
addition the benchmark when from scaling well on 10 sockets to scaling
well on just over 40 sockets.

...

Currently the benchmark stops scaling at around 40-44 sockets but it seems like
I found a second unrelated bottleneck."

[akpm@linux-foundation.org: use `bool' for boolean variables, remove unneeded/undesirable cast of void*, add missed ep_scan_ready_list() kerneldoc]
Signed-off-by: Jason Baron <jbaron@akamai.com>
Tested-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Eric Wong <normalperson@yhbt.net>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 95 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 584249454822..f7fe7e3ce664 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -585,14 +585,14 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
  * @sproc: Pointer to the scan callback.
  * @priv: Private opaque data passed to the @sproc callback.
  * @depth: The current depth of recursive f_op->poll calls.
+ * @ep_locked: caller already holds ep->mtx
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
 static int ep_scan_ready_list(struct eventpoll *ep,
 			      int (*sproc)(struct eventpoll *,
 					   struct list_head *, void *),
-			      void *priv,
-			      int depth)
+			      void *priv, int depth, bool ep_locked)
 {
 	int error, pwake = 0;
 	unsigned long flags;
@@ -603,7 +603,9 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
 	 */
-	mutex_lock_nested(&ep->mtx, depth);
+
+	if (!ep_locked)
+		mutex_lock_nested(&ep->mtx, depth);
 
 	/*
 	 * Steal the ready list, and re-init the original one to the
@@ -667,7 +669,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	}
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-	mutex_unlock(&ep->mtx);
+	if (!ep_locked)
+		mutex_unlock(&ep->mtx);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -822,15 +825,34 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 	return 0;
 }
 
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt);
+
+struct readyevents_arg {
+	struct eventpoll *ep;
+	bool locked;
+};
+
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 {
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
+	struct readyevents_arg *arg = priv;
+
+	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
+				  call_nests + 1, arg->locked);
 }
 
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
 	int pollflags;
 	struct eventpoll *ep = file->private_data;
+	struct readyevents_arg arg;
+
+	/*
+	 * During ep_insert() we already hold the ep->mtx for the tfile.
+	 * Prevent re-aquisition.
+	 */
+	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
+	arg.ep = ep;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
@@ -842,7 +864,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	 * could re-enter here.
 	 */
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, ep, ep, current);
+				   ep_poll_readyevents_proc, &arg, ep, current);
 
 	return pollflags != -1 ? pollflags : 0;
 }
@@ -1243,7 +1265,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
  * Must be called with "mtx" held.
  */
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
-		     struct file *tfile, int fd)
+		     struct file *tfile, int fd, int full_check)
 {
 	int error, revents, pwake = 0;
 	unsigned long flags;
@@ -1309,7 +1331,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* now check if we've created too many backpaths */
 	error = -EINVAL;
-	if (reverse_path_check())
+	if (full_check && reverse_path_check())
 		goto error_remove_epi;
 
 	/* We have to drop the new item inside our item list to keep track of it */
@@ -1532,7 +1554,7 @@ static int ep_send_events(struct eventpoll *ep,
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
 }
 
 static inline struct timespec ep_set_mstimeout(long ms)
@@ -1802,11 +1824,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
-	int did_lock_epmutex = 0;
+	int full_check = 0;
 	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
 	struct epoll_event epds;
+	struct eventpoll *tep = NULL;
 
 	error = -EFAULT;
 	if (ep_op_has_event(op) &&
@@ -1855,23 +1878,40 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * and hang them on the tfile_check_list, so we can check that we
 	 * haven't created too many possible wakeup paths.
 	 *
-	 * We need to hold the epmutex across ep_insert to prevent
-	 * multple adds from creating loops in parallel.
+	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
+	 * the epoll file descriptor is attaching directly to a wakeup source,
+	 * unless the epoll file descriptor is nested. The purpose of taking the
+	 * 'epmutex' on add is to prevent complex toplogies such as loops and
+	 * deep wakeup paths from forming in parallel through multiple
+	 * EPOLL_CTL_ADD operations.
 	 */
+	mutex_lock_nested(&ep->mtx, 0);
 	if (op == EPOLL_CTL_ADD) {
-		mutex_lock(&epmutex);
-		did_lock_epmutex = 1;
-		if (is_file_epoll(tf.file)) {
-			error = -ELOOP;
-			if (ep_loop_check(ep, tf.file) != 0) {
-				clear_tfile_check_list();
-				goto error_tgt_fput;
+		if (!list_empty(&f.file->f_ep_links) ||
+						is_file_epoll(tf.file)) {
+			full_check = 1;
+			mutex_unlock(&ep->mtx);
+			mutex_lock(&epmutex);
+			if (is_file_epoll(tf.file)) {
+				error = -ELOOP;
+				if (ep_loop_check(ep, tf.file) != 0) {
+					clear_tfile_check_list();
+					goto error_tgt_fput;
+				}
+			} else
+				list_add(&tf.file->f_tfile_llink,
+							&tfile_check_list);
+			mutex_lock_nested(&ep->mtx, 0);
+			if (is_file_epoll(tf.file)) {
+				tep = tf.file->private_data;
+				mutex_lock_nested(&tep->mtx, 1);
 			}
-		} else
-			list_add(&tf.file->f_tfile_llink, &tfile_check_list);
+		}
+	}
+	if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) {
+		tep = tf.file->private_data;
+		mutex_lock_nested(&tep->mtx, 1);
 	}
-
-	mutex_lock_nested(&ep->mtx, 0);
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
@@ -1885,10 +1925,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds.events |= POLLERR | POLLHUP;
-			error = ep_insert(ep, &epds, tf.file, fd);
+			error = ep_insert(ep, &epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
-		clear_tfile_check_list();
+		if (full_check)
+			clear_tfile_check_list();
 		break;
 	case EPOLL_CTL_DEL:
 		if (epi)
@@ -1904,10 +1945,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			error = -ENOENT;
 		break;
 	}
+	if (tep != NULL)
+		mutex_unlock(&tep->mtx);
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
-	if (did_lock_epmutex)
+	if (full_check)
 		mutex_unlock(&epmutex);
 
 	fdput(tf);
-- 
cgit v1.3


From 74a797d99a9fa4871ca978ac65e0f3a90dcf84d8 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Date: Tue, 12 Nov 2013 15:11:06 -0800
Subject: fs/hfs/btree.h: remove duplicate defines

This patch removes duplicate defines from fs/hfs/btree.h

[akpm@linux-foundation.org: retain the comments]
Signed-off-by: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Reviewed-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfs/btree.h           | 5 -----
 fs/hfsplus/hfsplus_raw.h | 8 ++++----
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index 2a1d712f85dc..f6bd266d70b5 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -153,11 +153,6 @@ struct hfs_btree_header_rec {
 	u32 reserved3[16];
 } __packed;
 
-#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
-#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
-#define HFS_NODE_MAP		0x02	/* Holds part of the bitmap of used nodes */
-#define HFS_NODE_LEAF		0xFF	/* A leaf (ndNHeight==1) node */
-
 #define BTREE_ATTR_BADCLOSE	0x00000001	/* b-tree not closed properly. not
 						   used by hfsplus. */
 #define HFS_TREE_BIGKEYS	0x00000002	/* key length is u16 instead of u8.
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 452ede01b036..2c54dd81def4 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -156,10 +156,10 @@ struct hfs_bnode_desc {
 } __packed;
 
 /* HFS+ BTree node types */
-#define HFS_NODE_INDEX	0x00
-#define HFS_NODE_HEADER	0x01
-#define HFS_NODE_MAP	0x02
-#define HFS_NODE_LEAF	0xFF
+#define HFS_NODE_INDEX	0x00	/* An internal (index) node */
+#define HFS_NODE_HEADER	0x01	/* The tree header node (node 0) */
+#define HFS_NODE_MAP	0x02	/* Holds part of the bitmap of used nodes */
+#define HFS_NODE_LEAF	0xFF	/* A leaf (ndNHeight==1) node */
 
 /* HFS+ BTree header */
 struct hfs_btree_header_rec {
-- 
cgit v1.3


From b3b5b0f03cc4ea074ed0ac110d0afd17e0ccdf9e Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Tue, 12 Nov 2013 15:11:07 -0800
Subject: hfsplus: add metadata file's clump size calculation functionality

There are situation when HFS+ volume had been created without
AttributesFile.  Such situation can take place because of using old
mkfs.hfs utility or creation HFS+ volume without taking in mind
necessity to use xattrs.  For example, Mac OS X 10.4 (Tiger) doesn't
create AttributesFile during mkfs phase.  Also it is a very frequent
situation for the case of users that created HFS+ volumes under Linux.
As a result, xattrs and POSIX ACLs on HFS+ volume are unavailable for
such users.

This patchset implements functionality of AttributesFile creation on
HFS+ volume in the case of this metadata file absence during operation
of xattr creation.

This patch:

Add functionality of metadata file's clump size calculation.  Operation
of AttributesFile creation needs in clump size setting.  This value will
be used when AttributesFile will be extended.

This code is adopted from code of newfs_hfs utility of diskdev_cmds packet
http://opensource.apple.com/tarballs/diskdev_cmds/.

Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/btree.c      | 112 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/hfsplus/hfsplus_fs.h |   1 +
 2 files changed, 113 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 0c6540c91167..0fcec8b2a90b 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -15,6 +15,118 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 
+/*
+ * Initial source code of clump size calculation is gotten
+ * from http://opensource.apple.com/tarballs/diskdev_cmds/
+ */
+#define CLUMP_ENTRIES	15
+
+static short clumptbl[CLUMP_ENTRIES * 3] = {
+/*
+ *	    Volume	Attributes	 Catalog	 Extents
+ *	     Size	Clump (MB)	Clump (MB)	Clump (MB)
+ */
+	/*   1GB */	  4,		  4,		 4,
+	/*   2GB */	  6,		  6,		 4,
+	/*   4GB */	  8,		  8,		 4,
+	/*   8GB */	 11,		 11,		 5,
+	/*
+	 * For volumes 16GB and larger, we want to make sure that a full OS
+	 * install won't require fragmentation of the Catalog or Attributes
+	 * B-trees.  We do this by making the clump sizes sufficiently large,
+	 * and by leaving a gap after the B-trees for them to grow into.
+	 *
+	 * For SnowLeopard 10A298, a FullNetInstall with all packages selected
+	 * results in:
+	 * Catalog B-tree Header
+	 *	nodeSize:          8192
+	 *	totalNodes:       31616
+	 *	freeNodes:         1978
+	 * (used = 231.55 MB)
+	 * Attributes B-tree Header
+	 *	nodeSize:          8192
+	 *	totalNodes:       63232
+	 *	freeNodes:          958
+	 * (used = 486.52 MB)
+	 *
+	 * We also want Time Machine backup volumes to have a sufficiently
+	 * large clump size to reduce fragmentation.
+	 *
+	 * The series of numbers for Catalog and Attribute form a geometric
+	 * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times
+	 * the previous term.  For Attributes (16GB to 512GB), each term is
+	 * 4**(1/5) times the previous term.  For 1TB to 16TB, each term is
+	 * 2**(1/5) times the previous term.
+	 */
+	/*  16GB */	 64,		 32,		 5,
+	/*  32GB */	 84,		 49,		 6,
+	/*  64GB */	111,		 74,		 7,
+	/* 128GB */	147,		111,		 8,
+	/* 256GB */	194,		169,		 9,
+	/* 512GB */	256,		256,		11,
+	/*   1TB */	294,		294,		14,
+	/*   2TB */	338,		338,		16,
+	/*   4TB */	388,		388,		20,
+	/*   8TB */	446,		446,		25,
+	/*  16TB */	512,		512,		32
+};
+
+u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size,
+					u64 sectors, int file_id)
+{
+	u32 mod = max(node_size, block_size);
+	u32 clump_size;
+	int column;
+	int i;
+
+	/* Figure out which column of the above table to use for this file. */
+	switch (file_id) {
+	case HFSPLUS_ATTR_CNID:
+		column = 0;
+		break;
+	case HFSPLUS_CAT_CNID:
+		column = 1;
+		break;
+	default:
+		column = 2;
+		break;
+	}
+
+	/*
+	 * The default clump size is 0.8% of the volume size. And
+	 * it must also be a multiple of the node and block size.
+	 */
+	if (sectors < 0x200000) {
+		clump_size = sectors << 2;	/*  0.8 %  */
+		if (clump_size < (8 * node_size))
+			clump_size = 8 * node_size;
+	} else {
+		/* turn exponent into table index... */
+		for (i = 0, sectors = sectors >> 22;
+		     sectors && (i < CLUMP_ENTRIES - 1);
+		     ++i, sectors = sectors >> 1) {
+			/* empty body */
+		}
+
+		clump_size = clumptbl[column + (i) * 3] * 1024 * 1024;
+	}
+
+	/*
+	 * Round the clump size to a multiple of node and block size.
+	 * NOTE: This rounds down.
+	 */
+	clump_size /= mod;
+	clump_size *= mod;
+
+	/*
+	 * Rounding down could have rounded down to 0 if the block size was
+	 * greater than the clump size.  If so, just use one block or node.
+	 */
+	if (clump_size == 0)
+		clump_size = mod;
+
+	return clump_size;
+}
 
 /* Get a reference to a B*Tree and do some initial checks */
 struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 2b9cd01696e2..1e36f18e904b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -380,6 +380,7 @@ int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *);
 int hfsplus_block_free(struct super_block *, u32, u32);
 
 /* btree.c */
+u32 hfsplus_calc_btree_clump_size(u32, u32, u64, int);
 struct hfs_btree *hfs_btree_open(struct super_block *, u32);
 void hfs_btree_close(struct hfs_btree *);
 int hfs_btree_write(struct hfs_btree *);
-- 
cgit v1.3


From 099e9245e04d50bb12ed621b4fa61df0a4c9dba9 Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Tue, 12 Nov 2013 15:11:08 -0800
Subject: hfsplus: implement attributes file's header node initialization code

Implement functionality of AttributesFile's header node initialization.

Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/hfsplus_raw.h |  3 +++
 fs/hfsplus/xattr.c       | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 2c54dd81def4..8ffb3a8ffe75 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -187,6 +187,9 @@ struct hfs_btree_header_rec {
 /* HFS+ BTree misc info */
 #define HFSPLUS_TREE_HEAD 0
 #define HFSPLUS_NODE_MXSZ 32768
+#define HFSPLUS_ATTR_TREE_NODE_SIZE		8192
+#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT	3
+#define HFSPLUS_BTREE_HDR_USER_BYTES		128
 
 /* Some special File ID numbers (stolen from hfs.h) */
 #define HFSPLUS_POR_CNID		1	/* Parent Of the Root */
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index bd8471fb9a6a..568a45cb1145 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -127,6 +127,71 @@ static int can_set_xattr(struct inode *inode, const char *name,
 	return 0;
 }
 
+static void hfsplus_init_header_node(struct inode *attr_file,
+					u32 clump_size,
+					char *buf, size_t node_size)
+{
+	struct hfs_bnode_desc *desc;
+	struct hfs_btree_header_rec *head;
+	u16 offset;
+	__be16 *rec_offsets;
+	u32 hdr_node_map_rec_bits;
+	char *bmp;
+	u32 used_nodes;
+	u32 used_bmp_bytes;
+
+	hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %zu\n",
+				clump_size, node_size);
+
+	/* The end of the node contains list of record offsets */
+	rec_offsets = (__be16 *)(buf + node_size);
+
+	desc = (struct hfs_bnode_desc *)buf;
+	desc->type = HFS_NODE_HEADER;
+	desc->num_recs = cpu_to_be16(HFSPLUS_BTREE_HDR_NODE_RECS_COUNT);
+	offset = sizeof(struct hfs_bnode_desc);
+	*--rec_offsets = cpu_to_be16(offset);
+
+	head = (struct hfs_btree_header_rec *)(buf + offset);
+	head->node_size = cpu_to_be16(node_size);
+	head->node_count = cpu_to_be32(i_size_read(attr_file) / node_size);
+	head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1);
+	head->clump_size = cpu_to_be32(clump_size);
+	head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS);
+	head->max_key_len = cpu_to_be16(HFSPLUS_ATTR_KEYLEN - sizeof(u16));
+	offset += sizeof(struct hfs_btree_header_rec);
+	*--rec_offsets = cpu_to_be16(offset);
+	offset += HFSPLUS_BTREE_HDR_USER_BYTES;
+	*--rec_offsets = cpu_to_be16(offset);
+
+	hdr_node_map_rec_bits = 8 * (node_size - offset - (4 * sizeof(u16)));
+	if (be32_to_cpu(head->node_count) > hdr_node_map_rec_bits) {
+		u32 map_node_bits;
+		u32 map_nodes;
+
+		desc->next = cpu_to_be32(be32_to_cpu(head->leaf_tail) + 1);
+		map_node_bits = 8 * (node_size - sizeof(struct hfs_bnode_desc) -
+					(2 * sizeof(u16)) - 2);
+		map_nodes = (be32_to_cpu(head->node_count) -
+				hdr_node_map_rec_bits +
+				(map_node_bits - 1)) / map_node_bits;
+		be32_add_cpu(&head->free_nodes, 0 - map_nodes);
+	}
+
+	bmp = buf + offset;
+	used_nodes =
+		be32_to_cpu(head->node_count) - be32_to_cpu(head->free_nodes);
+	used_bmp_bytes = used_nodes / 8;
+	if (used_bmp_bytes) {
+		memset(bmp, 0xFF, used_bmp_bytes);
+		bmp += used_bmp_bytes;
+		used_nodes %= 8;
+	}
+	*bmp = ~(0xFF >> used_nodes);
+	offset += hdr_node_map_rec_bits / 8;
+	*--rec_offsets = cpu_to_be16(offset);
+}
+
 int __hfsplus_setxattr(struct inode *inode, const char *name,
 			const void *value, size_t size, int flags)
 {
-- 
cgit v1.3


From 95e0d7dbb9b28ab0dfad7c7316066b05e1f1d4cd Mon Sep 17 00:00:00 2001
From: Vyacheslav Dubeyko <slava@dubeyko.com>
Date: Tue, 12 Nov 2013 15:11:09 -0800
Subject: hfsplus: implement attributes file creation functionality

Implement functionality of creation AttributesFile metadata file on HFS+
volume in the case of absence of it.

It makes trying to open AttributesFile's B-tree during mount of HFS+
volume.  If HFS+ volume hasn't AttributesFile then a pointer on
AttributesFile's B-tree keeps as NULL.  Thereby, when it is discovered
absence of AttributesFile on HFS+ volume in the begin of xattr creation
operation then AttributesFile will be created.

The creation of AttributesFile will have success in the case of
availability (2 * clump) free blocks on HFS+ volume.  Otherwise,
creation operation is ended with error (-ENOSPC).

Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: Hin-Tak Leung <htl10@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/hfsplus_fs.h |   9 +++
 fs/hfsplus/super.c      |   2 +
 fs/hfsplus/xattr.c      | 142 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 151 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 1e36f18e904b..08846425b67f 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -126,6 +126,14 @@ struct hfs_bnode {
 #define HFS_BNODE_DIRTY		3
 #define HFS_BNODE_DELETED	4
 
+/*
+ * Attributes file states
+ */
+#define HFSPLUS_EMPTY_ATTR_TREE		0
+#define HFSPLUS_CREATING_ATTR_TREE	1
+#define HFSPLUS_VALID_ATTR_TREE		2
+#define HFSPLUS_FAILED_ATTR_TREE	3
+
 /*
  * HFS+ superblock info (built from Volume Header on disk)
  */
@@ -141,6 +149,7 @@ struct hfsplus_sb_info {
 	struct hfs_btree *ext_tree;
 	struct hfs_btree *cat_tree;
 	struct hfs_btree *attr_tree;
+	atomic_t attr_tree_state;
 	struct inode *alloc_file;
 	struct inode *hidden_dir;
 	struct nls_table *nls;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4c4d142cf890..80875aa640ef 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -474,12 +474,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		pr_err("failed to load catalog file\n");
 		goto out_close_ext_tree;
 	}
+	atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
 	if (vhdr->attr_file.total_blocks != 0) {
 		sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
 		if (!sbi->attr_tree) {
 			pr_err("failed to load attributes file\n");
 			goto out_close_cat_tree;
 		}
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
 	}
 	sb->s_xattr = hfsplus_xattr_handlers;
 
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 568a45cb1145..efc85b1377cc 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -192,6 +192,143 @@ static void hfsplus_init_header_node(struct inode *attr_file,
 	*--rec_offsets = cpu_to_be16(offset);
 }
 
+static int hfsplus_create_attributes_file(struct super_block *sb)
+{
+	int err = 0;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct inode *attr_file;
+	struct hfsplus_inode_info *hip;
+	u32 clump_size;
+	u16 node_size = HFSPLUS_ATTR_TREE_NODE_SIZE;
+	char *buf;
+	int index, written;
+	struct address_space *mapping;
+	struct page *page;
+	int old_state = HFSPLUS_EMPTY_ATTR_TREE;
+
+	hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID);
+
+check_attr_tree_state_again:
+	switch (atomic_read(&sbi->attr_tree_state)) {
+	case HFSPLUS_EMPTY_ATTR_TREE:
+		if (old_state != atomic_cmpxchg(&sbi->attr_tree_state,
+						old_state,
+						HFSPLUS_CREATING_ATTR_TREE))
+			goto check_attr_tree_state_again;
+		break;
+	case HFSPLUS_CREATING_ATTR_TREE:
+		/*
+		 * This state means that another thread is in process
+		 * of AttributesFile creation. Theoretically, it is
+		 * possible to be here. But really __setxattr() method
+		 * first of all calls hfs_find_init() for lookup in
+		 * B-tree of CatalogFile. This method locks mutex of
+		 * CatalogFile's B-tree. As a result, if some thread
+		 * is inside AttributedFile creation operation then
+		 * another threads will be waiting unlocking of
+		 * CatalogFile's B-tree's mutex. However, if code will
+		 * change then we will return error code (-EAGAIN) from
+		 * here. Really, it means that first try to set of xattr
+		 * fails with error but second attempt will have success.
+		 */
+		return -EAGAIN;
+	case HFSPLUS_VALID_ATTR_TREE:
+		return 0;
+	case HFSPLUS_FAILED_ATTR_TREE:
+		return -EOPNOTSUPP;
+	default:
+		BUG();
+	}
+
+	attr_file = hfsplus_iget(sb, HFSPLUS_ATTR_CNID);
+	if (IS_ERR(attr_file)) {
+		pr_err("failed to load attributes file\n");
+		return PTR_ERR(attr_file);
+	}
+
+	BUG_ON(i_size_read(attr_file) != 0);
+
+	hip = HFSPLUS_I(attr_file);
+
+	clump_size = hfsplus_calc_btree_clump_size(sb->s_blocksize,
+						    node_size,
+						    sbi->sect_count,
+						    HFSPLUS_ATTR_CNID);
+
+	mutex_lock(&hip->extents_lock);
+	hip->clump_blocks = clump_size >> sbi->alloc_blksz_shift;
+	mutex_unlock(&hip->extents_lock);
+
+	if (sbi->free_blocks <= (hip->clump_blocks << 1)) {
+		err = -ENOSPC;
+		goto end_attr_file_creation;
+	}
+
+	while (hip->alloc_blocks < hip->clump_blocks) {
+		err = hfsplus_file_extend(attr_file);
+		if (unlikely(err)) {
+			pr_err("failed to extend attributes file\n");
+			goto end_attr_file_creation;
+		}
+		hip->phys_size = attr_file->i_size =
+			(loff_t)hip->alloc_blocks << sbi->alloc_blksz_shift;
+		hip->fs_blocks = hip->alloc_blocks << sbi->fs_shift;
+		inode_set_bytes(attr_file, attr_file->i_size);
+	}
+
+	buf = kzalloc(node_size, GFP_NOFS);
+	if (!buf) {
+		pr_err("failed to allocate memory for header node\n");
+		err = -ENOMEM;
+		goto end_attr_file_creation;
+	}
+
+	hfsplus_init_header_node(attr_file, clump_size, buf, node_size);
+
+	mapping = attr_file->i_mapping;
+
+	index = 0;
+	written = 0;
+	for (; written < node_size; index++, written += PAGE_CACHE_SIZE) {
+		void *kaddr;
+
+		page = read_mapping_page(mapping, index, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed_header_node_init;
+		}
+
+		kaddr = kmap_atomic(page);
+		memcpy(kaddr, buf + written,
+			min_t(size_t, PAGE_CACHE_SIZE, node_size - written));
+		kunmap_atomic(kaddr);
+
+		set_page_dirty(page);
+		page_cache_release(page);
+	}
+
+	hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY);
+
+	sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
+	if (!sbi->attr_tree)
+		pr_err("failed to load attributes file\n");
+
+failed_header_node_init:
+	kfree(buf);
+
+end_attr_file_creation:
+	iput(attr_file);
+
+	if (!err)
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
+	else if (err == -ENOSPC)
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
+	else
+		atomic_set(&sbi->attr_tree_state, HFSPLUS_FAILED_ATTR_TREE);
+
+	return err;
+}
+
 int __hfsplus_setxattr(struct inode *inode, const char *name,
 			const void *value, size_t size, int flags)
 {
@@ -276,8 +413,9 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
 	}
 
 	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) {
-		err = -EOPNOTSUPP;
-		goto end_setxattr;
+		err = hfsplus_create_attributes_file(inode->i_sb);
+		if (unlikely(err))
+			goto end_setxattr;
 	}
 
 	if (hfsplus_attr_exists(inode, name)) {
-- 
cgit v1.3


From 5721cf84d9d8d41915f77dd639969b7fa91b6a79 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Tue, 12 Nov 2013 15:11:15 -0800
Subject: procfs: clean up proc_reg_get_unmapped_area for 80-column limit

Clean up proc_reg_get_unmapped_area due to its 80-column limit
violation.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Tested-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8eaa1ba793fc..28955d4b7218 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -285,19 +285,23 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 	return rv;
 }
 
-static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+			   unsigned long len, unsigned long pgoff,
+			   unsigned long flags)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	unsigned long rv = -EIO;
-	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL;
+	unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
+				  unsigned long, unsigned long) = NULL;
 	if (use_pde(pde)) {
 #ifdef CONFIG_MMU
-		get_unmapped_area = current->mm->get_unmapped_area;
+		get_area = current->mm->get_unmapped_area;
 #endif
 		if (pde->proc_fops->get_unmapped_area)
-			get_unmapped_area = pde->proc_fops->get_unmapped_area;
-		if (get_unmapped_area)
-			rv = get_unmapped_area(file, orig_addr, len, pgoff, flags);
+			get_area = pde->proc_fops->get_unmapped_area;
+		if (get_area)
+			rv = get_area(file, orig_addr, len, pgoff, flags);
 		unuse_pde(pde);
 	}
 	return rv;
-- 
cgit v1.3


From 1c3fc3e5cc8a81d21b199cb739d5d9c51f3688f4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 12 Nov 2013 15:11:16 -0800
Subject: kcore: add Kconfig help text

Under Pseudo filesystems, /proc/kcore support has no help.

Fixes a portion of kernel bugzilla #52671:
  https://bugzilla.kernel.org/show_bug.cgi?id=52671

Thanks for David Howells for the help text.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: <lailavrazda1979@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 15af6222f8a4..2183fcf41d59 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,10 @@ config PROC_FS
 config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
+	help
+	  Provides a virtual ELF core file of the live kernel.  This can
+	  be read with gdb and other ELF tools.  No modifications can be
+	  made using this mechanism.
 
 config PROC_VMCORE
 	bool "/proc/vmcore support"
-- 
cgit v1.3


From d049f74f2dbe71354d43d393ac3a188947811348 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 12 Nov 2013 15:11:17 -0800
Subject: exec/ptrace: fix get_dumpable() incorrect tests

The get_dumpable() return value is not boolean.  Most users of the
function actually want to be testing for non-SUID_DUMP_USER(1) rather than
SUID_DUMP_DISABLE(0).  The SUID_DUMP_ROOT(2) is also considered a
protected state.  Almost all places did this correctly, excepting the two
places fixed in this patch.

Wrong logic:
    if (dumpable == SUID_DUMP_DISABLE) { /* be protective */ }
        or
    if (dumpable == 0) { /* be protective */ }
        or
    if (!dumpable) { /* be protective */ }

Correct logic:
    if (dumpable != SUID_DUMP_USER) { /* be protective */ }
        or
    if (dumpable != 1) { /* be protective */ }

Without this patch, if the system had set the sysctl fs/suid_dumpable=2, a
user was able to ptrace attach to processes that had dropped privileges to
that user.  (This may have been partially mitigated if Yama was enabled.)

The macros have been moved into the file that declares get/set_dumpable(),
which means things like the ia64 code can see them too.

CVE-2013-2929

Reported-by: Vasily Kulikov <segoon@openwall.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/include/asm/processor.h | 2 +-
 fs/exec.c                         | 6 ++++++
 include/linux/binfmts.h           | 3 ---
 include/linux/sched.h             | 4 ++++
 kernel/ptrace.c                   | 3 ++-
 5 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index e0a899a1a8a6..5a84b3a50741 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -319,7 +319,7 @@ struct thread_struct {
 	regs->loadrs = 0;									\
 	regs->r8 = get_dumpable(current->mm);	/* set "don't zap registers" flag */		\
 	regs->r12 = new_sp - 16;	/* allocate 16 byte scratch area */			\
-	if (unlikely(!get_dumpable(current->mm))) {							\
+	if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) {	\
 		/*										\
 		 * Zap scratch regs to avoid leaking bits between processes with different	\
 		 * uid/privileges.								\
diff --git a/fs/exec.c b/fs/exec.c
index 2ea437e5acf4..12120620f040 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1669,6 +1669,12 @@ int __get_dumpable(unsigned long mm_flags)
 	return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
 }
 
+/*
+ * This returns the actual value of the suid_dumpable flag. For things
+ * that are using this for checking for privilege transitions, it must
+ * test against SUID_DUMP_USER rather than treating it as a boolean
+ * value.
+ */
 int get_dumpable(struct mm_struct *mm)
 {
 	return __get_dumpable(mm->flags);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index e8112ae50531..7554fd410bcc 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -99,9 +99,6 @@ extern void setup_new_exec(struct linux_binprm * bprm);
 extern void would_dump(struct linux_binprm *, struct file *);
 
 extern int suid_dumpable;
-#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
-#define SUID_DUMP_USER		1	/* Dump as user of process */
-#define SUID_DUMP_ROOT		2	/* Dump as root */
 
 /* Stack area protections */
 #define EXSTACK_DEFAULT   0	/* Whatever the arch defaults to */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5e226fe3e512..f7efc8604652 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -323,6 +323,10 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 extern void set_dumpable(struct mm_struct *mm, int value);
 extern int get_dumpable(struct mm_struct *mm);
 
+#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
+#define SUID_DUMP_USER		1	/* Dump as user of process */
+#define SUID_DUMP_ROOT		2	/* Dump as root */
+
 /* mm flags */
 /* dumpable bits */
 #define MMF_DUMPABLE      0  /* core dump is permitted */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dd562e9aa2c8..1f4bcb3cc21c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -257,7 +257,8 @@ ok:
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
 	rcu_read_lock();
-	if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+	if (dumpable != SUID_DUMP_USER &&
+	    !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
 		rcu_read_unlock();
 		return -EPERM;
 	}
-- 
cgit v1.3


From 66da0e1f9034140ae2f571ef96e254a25083906c Mon Sep 17 00:00:00 2001
From: Ilija Hadzic <ihadzic@research.bell-labs.com>
Date: Tue, 12 Nov 2013 15:11:45 -0800
Subject: devpts: plug the memory leak in kill_sb

When devpts is unmounted, there may be a no-longer-used IDR tree hanging
off the superblock we are about to kill.  This needs to be cleaned up
before destroying the SB.

The leak is usually not a big deal because unmounting devpts is typically
done when shutting down the whole machine.  However, shutting down an LXC
container instead of a physical machine exposes the problem (the garbage
is detectable with kmemleak).

Signed-off-by: Ilija Hadzic <ihadzic@research.bell-labs.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 073d30b9d1ac..a726b9f29cb7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -498,6 +498,7 @@ static void devpts_kill_sb(struct super_block *sb)
 {
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 
+	ida_destroy(&fsi->allocated_ptys);
 	kfree(fsi);
 	kill_litter_super(sb);
 }
-- 
cgit v1.3


From 7f62656be8a8ef14c168db2d98021fb9c8cc1076 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 13 Nov 2013 10:49:40 +0300
Subject: aio: checking for NULL instead of IS_ERR

alloc_anon_inode() returns an ERR_PTR(), it doesn't return NULL.

Fixes: 71ad7490c1f3 ('rework aio migrate pages to use aio fs')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 1f602d9be4c5..823efcbb6ccd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -163,8 +163,8 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	struct file *file;
 	struct path path;
 	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
 	inode->i_mapping->private_data = ctx;
-- 
cgit v1.3


From 4fdb793ffe262cfc0b796d3b2cd05ca3b687deb9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 13 Nov 2013 10:56:27 +0300
Subject: locks: missing unlock on error in generic_add_lease()

We should unlock here before returning.

Fixes: df4e8d2c1d2b ('locks: implement delegations')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/locks.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index f99d52bdd05a..92a0f0a52b06 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1494,6 +1494,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
 
 	if (is_deleg && arg == F_WRLCK) {
 		/* Write delegations are not currently supported: */
+		mutex_unlock(&inode->i_mutex);
 		WARN_ON_ONCE(1);
 		return -EINVAL;
 	}
-- 
cgit v1.3


From 4ec6c2aeab8a909418ac60b782a963cb7257207f Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Wed, 13 Nov 2013 15:21:51 +0800
Subject: fix unpaired rcu lock in prepend_path()

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 1f24cd684c51..eacc853a2e3a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2924,6 +2924,7 @@ static int prepend_path(const struct path *path,
 restart_mnt:
 	read_seqbegin_or_lock(&mount_lock, &m_seq);
 	seq = 0;
+	rcu_read_lock();
 restart:
 	bptr = *buffer;
 	blen = *buflen;
@@ -2971,6 +2972,9 @@ restart:
 		goto restart;
 	}
 	done_seqretry(&rename_lock, seq);
+
+	if (!(m_seq & 1))
+		rcu_read_unlock();
 	if (need_seqretry(&mount_lock, m_seq)) {
 		m_seq = 1;
 		goto restart_mnt;
-- 
cgit v1.3


From ede4cebce16f5643c61aedd6d88d9070a1d23a68 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 13 Nov 2013 07:45:40 -0500
Subject: prepend_path() needs to reinitialize dentry/vfsmount/mnt on restarts

... and equivalent is needed in 3.12; it's broken there as well

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index eacc853a2e3a..a9dd384c5e80 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2912,9 +2912,9 @@ static int prepend_path(const struct path *path,
 			const struct path *root,
 			char **buffer, int *buflen)
 {
-	struct dentry *dentry = path->dentry;
-	struct vfsmount *vfsmnt = path->mnt;
-	struct mount *mnt = real_mount(vfsmnt);
+	struct dentry *dentry;
+	struct vfsmount *vfsmnt;
+	struct mount *mnt;
 	int error = 0;
 	unsigned seq, m_seq = 0;
 	char *bptr;
@@ -2929,6 +2929,9 @@ restart:
 	bptr = *buffer;
 	blen = *buflen;
 	error = 0;
+	dentry = path->dentry;
+	vfsmnt = path->mnt;
+	mnt = real_mount(vfsmnt);
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
-- 
cgit v1.3


From 6ff40decff0ef35a5d755ec60182d7f803356dfb Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 5 Nov 2013 15:07:16 -0500
Subject: nfsd4: improve write performance with better sendspace reservations

Currently the rpc code conservatively refuses to accept rpc's from a
client if the sum of its worst-case estimates of the replies it owes
that client exceed the send buffer space.

Unfortunately our estimate of the worst-case reply for an NFSv4 compound
is always the maximum read size.  This can unnecessarily limit the
number of operations we handle concurrently, for example in the case
most operations are writes (which have small replies).

We can do a little better if we check which ops the compound contains.

This is still a rough estimate, we'll need to improve on it some day.

Reported-by: Shyam Kaushik <shyamnfs1@gmail.com>
Tested-by: Shyam Kaushik <shyamnfs1@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d9d7fa94967f..9d76ee311ddc 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1597,12 +1597,39 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
 	return true;
 }
 
+/*
+ * Return a rough estimate of the maximum possible reply size.  Note the
+ * estimate includes rpc headers so is meant to be passed to
+ * svc_reserve, not svc_reserve_auth.
+ *
+ * Also note the current compound encoding permits only one operation to
+ * use pages beyond the first one, so the maximum possible length is the
+ * maximum over these values, not the sum.
+ */
+static int nfsd4_max_reply(u32 opnum)
+{
+	switch (opnum) {
+	case OP_READLINK:
+	case OP_READDIR:
+		/*
+		 * Both of these ops take a single page for data and put
+		 * the head and tail in another page:
+		 */
+		return 2 * PAGE_SIZE;
+	case OP_READ:
+		return INT_MAX;
+	default:
+		return PAGE_SIZE;
+	}
+}
+
 static __be32
 nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 {
 	DECODE_HEAD;
 	struct nfsd4_op *op;
 	bool cachethis = false;
+	int max_reply = PAGE_SIZE;
 	int i;
 
 	READ_BUF(4);
@@ -1652,10 +1679,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		 * op in the compound wants to be cached:
 		 */
 		cachethis |= nfsd4_cache_this_op(op);
+
+		max_reply = max(max_reply, nfsd4_max_reply(op->opnum));
 	}
 	/* Sessions make the DRC unnecessary: */
 	if (argp->minorversion)
 		cachethis = false;
+	if (max_reply != INT_MAX)
+		svc_reserve(argp->rqstp, max_reply);
 	argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
 
 	DECODE_TAIL;
-- 
cgit v1.3


From 6d769f1e1420179d1f83cf1a9cdc585b46c28545 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 13 Nov 2013 09:08:21 -0500
Subject: nfs: don't retry detect_trunking with RPC_AUTH_UNIX more than once

Currently, when we try to mount and get back NFS4ERR_CLID_IN_USE or
NFS4ERR_WRONGSEC, we create a new rpc_clnt and then try the call again.
There is no guarantee that doing so will work however, so we can end up
retrying the call in an infinite loop.

Worse yet, we create the new client using rpc_clone_client_set_auth,
which creates the new client as a child of the old one. Thus, we can end
up with a *very* long lineage of rpc_clnts. When we go to put all of the
references to them, we can end up with a long call chain that can smash
the stack as each rpc_free_client() call can recurse back into itself.

This patch fixes this by simply ensuring that the SETCLIENTID call will
only be retried in this situation if the last attempt did not use
RPC_AUTH_UNIX.

Note too that with this change, we don't need the (i > 2) check in the
-EACCES case since we now have a more reliable test as to whether we
should reattempt.

Cc: stable@vger.kernel.org # v3.10+
Cc: Chuck Lever <chuck.lever@oracle.com>
Tested-by/Acked-by: Weston Andros Adamson <dros@netapp.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c8e729deb4f7..6f0470667f95 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2093,10 +2093,15 @@ again:
 			nfs4_root_machine_cred(clp);
 			goto again;
 		}
-		if (i > 2)
+		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX)
 			break;
 	case -NFS4ERR_CLID_INUSE:
 	case -NFS4ERR_WRONGSEC:
+		/* No point in retrying if we already used RPC_AUTH_UNIX */
+		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) {
+			status = -EPERM;
+			break;
+		}
 		clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);
 		if (IS_ERR(clnt)) {
 			status = PTR_ERR(clnt);
-- 
cgit v1.3


From 41c1358e9181ab1ebd773905b3fa8039b61aa0e9 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Thu, 14 Nov 2013 00:05:36 -0600
Subject: CIFS: SMB2/SMB3 Copy offload support (refcopy) phase 1

This first patch adds the ability for us to do a server side copy
(ie fast copy offloaded to the server to perform, aka refcopy)

"cp --reflink"

of one file to another located on the same server.  This
is much faster than traditional copy (which requires
reading and writing over the network and extra
memcpys).

This first version is not going to be copy
files larger than about 1MB (to Samba) until I add
support for multiple chunks and for autoconfiguring
the chunksize.

It includes:
1) processing of the ioctl
2) marshalling and sending the SMB2/SMB3 fsctl over the network
3) simple parsing of the response

It does not include yet (these will be in followon patches to come soon):
1) support for multiple chunks
2) support for autoconfiguring and remembering the chunksize
3) Support for the older style copychunk which Samba 4.1 server supports
(because this requires write permission on the target file, which
cp does not give you, apparently per-posix).  This may require
a distinct tool (other than cp) and other ioctl to implement.

Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsglob.h |   3 ++
 fs/cifs/ioctl.c    | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/smb2ops.c  |  82 +++++++++++++++++++++++++++++++++++++++
 fs/cifs/smb2pdu.h  |  15 +++++++-
 4 files changed, 210 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index cddb807addde..50a6ca1bb521 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -381,6 +381,9 @@ struct smb_version_operations {
 	char * (*create_lease_buf)(u8 *, u8);
 	/* parse lease context buffer and return oplock/epoch info */
 	__u8 (*parse_lease_buf)(void *, unsigned int *);
+	int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
+			struct cifsFileInfo *target_file, u64 src_off, u64 len,
+			u64 dest_off);
 };
 
 struct smb_version_values {
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index ba54bf6ab116..409b45eefe70 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -22,12 +22,120 @@
  */
 
 #include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
 
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+			unsigned long srcfd, u64 off, u64 len, u64 destoff)
+{
+	int rc;
+	struct cifsFileInfo *smb_file_target = dst_file->private_data;
+	struct inode *target_inode = file_inode(dst_file);
+	struct cifs_tcon *target_tcon;
+	struct fd src_file;
+	struct cifsFileInfo *smb_file_src;
+	struct inode *src_inode;
+	struct cifs_tcon *src_tcon;
+
+	cifs_dbg(FYI, "ioctl clone range\n");
+	/* the destination must be opened for writing */
+	if (!(dst_file->f_mode & FMODE_WRITE)) {
+		cifs_dbg(FYI, "file target not open for write\n");
+		return -EINVAL;
+	}
+
+	/* check if target volume is readonly and take reference */
+	rc = mnt_want_write_file(dst_file);
+	if (rc) {
+		cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+		return rc;
+	}
+
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
+		rc = -EBADF;
+		goto out_drop_write;
+	}
+
+	if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+		rc = -EBADF;
+		cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+		goto out_fput;
+	}
+
+	rc = -EXDEV;
+	smb_file_target = dst_file->private_data;
+	smb_file_src = src_file.file->private_data;
+	src_tcon = tlink_tcon(smb_file_src->tlink);
+	target_tcon = tlink_tcon(smb_file_target->tlink);
+
+	/* check if source and target are on same tree connection */
+	if (src_tcon != target_tcon) {
+		cifs_dbg(VFS, "file copy src and target on different volume\n");
+		goto out_fput;
+	}
+
+	src_inode = src_file.file->f_dentry->d_inode;
+
+	/*
+	 * Note: cifs case is easier than btrfs since server responsible for
+	 * checks for proper open modes and file type and if it wants
+	 * server could even support copy of range where source = target
+	 */
+
+	/* so we do not deadlock racing two ioctls on same files */
+	if (target_inode < src_inode) {
+		mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+	} else {
+		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
+	}
+
+	/* determine range to clone */
+	rc = -EINVAL;
+	if (off + len > src_inode->i_size || off + len < off)
+		goto out_unlock;
+	if (len == 0)
+		len = src_inode->i_size - off;
+
+	cifs_dbg(FYI, "about to flush pages\n");
+	/* should we flush first and last page first */
+	truncate_inode_pages_range(&target_inode->i_data, destoff,
+				   PAGE_CACHE_ALIGN(destoff + len)-1);
+
+	if (target_tcon->ses->server->ops->clone_range)
+		rc = target_tcon->ses->server->ops->clone_range(xid,
+			smb_file_src, smb_file_target, off, len, destoff);
+
+	/* force revalidate of size and timestamps of target file now
+	   that target is updated on the server */
+	CIFS_I(target_inode)->time = 0;
+out_unlock:
+	/* although unlocking in the reverse order from locking is not
+	   strictly necessary here it is a little cleaner to be consistent */
+	if (target_inode < src_inode) {
+		mutex_unlock(&src_inode->i_mutex);
+		mutex_unlock(&target_inode->i_mutex);
+	} else {
+		mutex_unlock(&target_inode->i_mutex);
+		mutex_unlock(&src_inode->i_mutex);
+	}
+out_fput:
+	fdput(src_file);
+out_drop_write:
+	mnt_drop_write_file(dst_file);
+	return rc;
+}
+
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 {
 	struct inode *inode = file_inode(filep);
@@ -105,6 +213,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 				cifs_dbg(FYI, "set compress flag rc %d\n", rc);
 			}
 			break;
+		case BTRFS_IOC_CLONE:
+			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
+			break;
 		default:
 			cifs_dbg(FYI, "unsupported ioctl\n");
 			break;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index c571be8cb76e..11dde4b24f8a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -493,6 +493,85 @@ smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon,
 	SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
 }
 
+static int
+SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid,
+		     struct copychunk_ioctl *pcchunk)
+{
+	int rc;
+	unsigned int ret_data_len;
+	struct resume_key_req *res_key;
+
+	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+			FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
+			NULL, 0 /* no input */,
+			(char **)&res_key, &ret_data_len);
+
+	if (rc) {
+		cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
+		goto req_res_key_exit;
+	}
+	if (ret_data_len < sizeof(struct resume_key_req)) {
+		cifs_dbg(VFS, "Invalid refcopy resume key length\n");
+		rc = -EINVAL;
+		goto req_res_key_exit;
+	}
+	memcpy(pcchunk->SourceKey, res_key->ResumeKey, COPY_CHUNK_RES_KEY_SIZE);
+
+req_res_key_exit:
+	kfree(res_key);
+	return rc;
+}
+
+static int
+smb2_clone_range(const unsigned int xid,
+			struct cifsFileInfo *srcfile,
+			struct cifsFileInfo *trgtfile, u64 src_off,
+			u64 len, u64 dest_off)
+{
+	int rc;
+	unsigned int ret_data_len;
+	struct copychunk_ioctl *pcchunk;
+	char *retbuf = NULL;
+
+	pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+
+	if (pcchunk == NULL)
+		return -ENOMEM;
+
+	cifs_dbg(FYI, "in smb2_clone_range - about to call request res key\n");
+	/* Request a key from the server to identify the source of the copy */
+	rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink),
+				srcfile->fid.persistent_fid,
+				srcfile->fid.volatile_fid, pcchunk);
+
+	/* Note: request_res_key sets res_key null only if rc !=0 */
+	if (rc)
+		return rc;
+
+	/* For now array only one chunk long, will make more flexible later */
+	pcchunk->ChunkCount = __constant_cpu_to_le32(1);
+	pcchunk->Reserved = 0;
+	pcchunk->SourceOffset = cpu_to_le64(src_off);
+	pcchunk->TargetOffset = cpu_to_le64(dest_off);
+	pcchunk->Length = cpu_to_le32(len);
+	pcchunk->Reserved2 = 0;
+
+	/* Request that server copy to target from src file identified by key */
+	rc = SMB2_ioctl(xid, tlink_tcon(trgtfile->tlink),
+			trgtfile->fid.persistent_fid,
+			trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
+			true /* is_fsctl */, (char *)pcchunk,
+			sizeof(struct copychunk_ioctl),	&retbuf, &ret_data_len);
+
+	/* BB need to special case rc = EINVAL to alter chunk size */
+
+	cifs_dbg(FYI, "rc %d data length out %d\n", rc, ret_data_len);
+
+	kfree(pcchunk);
+	return rc;
+}
+
 static int
 smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
 		struct cifs_fid *fid)
@@ -1017,6 +1096,7 @@ struct smb_version_operations smb20_operations = {
 	.set_oplock_level = smb2_set_oplock_level,
 	.create_lease_buf = smb2_create_lease_buf,
 	.parse_lease_buf = smb2_parse_lease_buf,
+	.clone_range = smb2_clone_range,
 };
 
 struct smb_version_operations smb21_operations = {
@@ -1090,6 +1170,7 @@ struct smb_version_operations smb21_operations = {
 	.set_oplock_level = smb21_set_oplock_level,
 	.create_lease_buf = smb2_create_lease_buf,
 	.parse_lease_buf = smb2_parse_lease_buf,
+	.clone_range = smb2_clone_range,
 };
 
 struct smb_version_operations smb30_operations = {
@@ -1165,6 +1246,7 @@ struct smb_version_operations smb30_operations = {
 	.set_oplock_level = smb3_set_oplock_level,
 	.create_lease_buf = smb3_create_lease_buf,
 	.parse_lease_buf = smb3_parse_lease_buf,
+	.clone_range = smb2_clone_range,
 };
 
 struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 6183b1b7550f..b50a129572cd 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -534,9 +534,16 @@ struct create_durable {
 	} Data;
 } __packed;
 
+#define COPY_CHUNK_RES_KEY_SIZE	24
+struct resume_key_req {
+	char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
+	__le32	ContextLength;	/* MBZ */
+	char	Context[0];	/* ignored, Windows sets to 4 bytes of zero */
+} __packed;
+
 /* this goes in the ioctl buffer when doing a copychunk request */
 struct copychunk_ioctl {
-	char SourceKey[24];
+	char SourceKey[COPY_CHUNK_RES_KEY_SIZE];
 	__le32 ChunkCount; /* we are only sending 1 */
 	__le32 Reserved;
 	/* array will only be one chunk long for us */
@@ -546,6 +553,12 @@ struct copychunk_ioctl {
 	__u32 Reserved2;
 } __packed;
 
+struct copychunk_ioctl_rsp {
+	__le32 ChunksWritten;
+	__le32 ChunkBytesWritten;
+	__le32 TotalBytesWritten;
+} __packed;
+
 /* Response and Request are the same format */
 struct validate_negotiate_info {
 	__le32 Capabilities;
-- 
cgit v1.3


From 2000f5beabc9c6baf084de5f7879975408e3652c Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Wed, 13 Nov 2013 10:09:42 -0800
Subject: eCryptfs: file->private_data is always valid

When accessing the lower_file pointer located in private_data of
eCryptfs files, there is no need to check to see if the private_data
pointer has been initialized to a non-NULL value. The file->private_data
and file->private_data->lower_file pointers are always initialized to
non-NULL values in ecryptfs_open().

This change quiets a Smatch warning:

  CHECK   /var/scm/kernel/linux/fs/ecryptfs/file.c
fs/ecryptfs/file.c:321 ecryptfs_unlocked_ioctl() error: potential NULL dereference 'lower_file'.
fs/ecryptfs/file.c:335 ecryptfs_compat_ioctl() error: potential NULL dereference 'lower_file'.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Geyslan G. Bem <geyslan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/file.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2229a74aeeed..b1eaa7a1f82c 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -313,11 +313,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
 static long
 ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct file *lower_file = NULL;
+	struct file *lower_file = ecryptfs_file_to_lower(file);
 	long rc = -ENOTTY;
 
-	if (ecryptfs_file_to_private(file))
-		lower_file = ecryptfs_file_to_lower(file);
 	if (lower_file->f_op->unlocked_ioctl)
 		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
 	return rc;
@@ -327,11 +325,9 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 static long
 ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct file *lower_file = NULL;
+	struct file *lower_file = ecryptfs_file_to_lower(file);
 	long rc = -ENOIOCTLCMD;
 
-	if (ecryptfs_file_to_private(file))
-		lower_file = ecryptfs_file_to_lower(file);
 	if (lower_file->f_op && lower_file->f_op->compat_ioctl)
 		rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
 	return rc;
-- 
cgit v1.3


From aea240f4162d50e0f2d8bd5ea3ba11b5f072add8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 14 Nov 2013 05:50:10 -0800
Subject: nfsd: export proper maximum file size to the client

I noticed that we export a way to high value for the maxfilesize
attribute when debugging a client issue.  The issue didn't turn
out to be related to it, but I think we should export it, so that
clients can limit what write sizes they accept before hitting
the server.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9d76ee311ddc..088de1355e93 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2391,7 +2391,7 @@ out_acl:
 	if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
 		if ((buflen -= 8) < 0)
 			goto out_resource;
-		WRITE64(~(u64)0);
+		WRITE64(exp->ex_path.mnt->mnt_sb->s_maxbytes);
 	}
 	if (bmval0 & FATTR4_WORD0_MAXLINK) {
 		if ((buflen -= 4) < 0)
-- 
cgit v1.3


From e1f56c89b040134add93f686931cc266541d239a Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 14 Nov 2013 14:30:48 -0800
Subject: mm: convert mm->nr_ptes to atomic_long_t

With split page table lock for PMD level we can't hold mm->page_table_lock
while updating nr_ptes.

Let's convert it to atomic_long_t to avoid races.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Alex Thorlton <athorlton@sgi.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c       |  3 ++-
 include/linux/mm_types.h |  2 +-
 kernel/fork.c            |  2 +-
 mm/huge_memory.c         | 10 +++++-----
 mm/memory.c              |  4 ++--
 mm/mmap.c                |  3 ++-
 mm/oom_kill.c            |  6 +++---
 7 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index abbe825d20ff..8faaebdc6b02 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -62,7 +62,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		(PTRS_PER_PTE * sizeof(pte_t) *
+		 atomic_long_read(&mm->nr_ptes)) >> 10,
 		swap << (PAGE_SHIFT-10));
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f1ff66dfa79e..566df579c51f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -339,6 +339,7 @@ struct mm_struct {
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
+	atomic_long_t nr_ptes;			/* Page table pages */
 	int map_count;				/* number of VMAs */
 
 	spinlock_t page_table_lock;		/* Protects page tables and some counters */
@@ -360,7 +361,6 @@ struct mm_struct {
 	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE */
 	unsigned long stack_vm;		/* VM_GROWSUP/DOWN */
 	unsigned long def_flags;
-	unsigned long nr_ptes;		/* Page table pages */
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
diff --git a/kernel/fork.c b/kernel/fork.c
index f6d11fc67f72..e2520756e005 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm->flags = (current->mm) ?
 		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
-	mm->nr_ptes = 0;
+	atomic_long_set(&mm->nr_ptes, 0);
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
 	mm_init_aio(mm);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0556c6a44959..e5b2d316be2e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -738,7 +738,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		mm->nr_ptes++;
+		atomic_long_inc(&mm->nr_ptes);
 		spin_unlock(&mm->page_table_lock);
 	}
 
@@ -771,7 +771,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = pmd_mkhuge(entry);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	mm->nr_ptes++;
+	atomic_long_inc(&mm->nr_ptes);
 	return true;
 }
 
@@ -896,7 +896,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	dst_mm->nr_ptes++;
+	atomic_long_inc(&dst_mm->nr_ptes);
 
 	ret = 0;
 out_unlock:
@@ -1392,7 +1392,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
-			tlb->mm->nr_ptes--;
+			atomic_long_dec(&tlb->mm->nr_ptes);
 			spin_unlock(&tlb->mm->page_table_lock);
 			put_huge_zero_page();
 		} else {
@@ -1401,7 +1401,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			VM_BUG_ON(page_mapcount(page) < 0);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 			VM_BUG_ON(!PageHead(page));
-			tlb->mm->nr_ptes--;
+			atomic_long_dec(&tlb->mm->nr_ptes);
 			spin_unlock(&tlb->mm->page_table_lock);
 			tlb_remove_page(tlb, page);
 		}
diff --git a/mm/memory.c b/mm/memory.c
index bf8665849a5f..0b5a93a49f27 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	tlb->mm->nr_ptes--;
+	atomic_long_dec(&tlb->mm->nr_ptes);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -573,7 +573,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	wait_split_huge_page = 0;
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		mm->nr_ptes++;
+		atomic_long_inc(&mm->nr_ptes);
 		pmd_populate(mm, pmd, new);
 		new = NULL;
 	} else if (unlikely(pmd_trans_splitting(*pmd)))
diff --git a/mm/mmap.c b/mm/mmap.c
index 5a6baddde15d..834b2d785f1e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2724,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm)
 	}
 	vm_unacct_memory(nr_accounted);
 
-	WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+	WARN_ON(atomic_long_read(&mm->nr_ptes) >
+			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6738c47f1f72..1e4a600a6163 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + p->mm->nr_ptes +
+	points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
 		 get_mm_counter(p->mm, MM_SWAPENTS);
 	task_unlock(p);
 
@@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			task->mm->nr_ptes,
+			atomic_long_read(&task->mm->nr_ptes),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
cgit v1.3


From bf929152e9f6c49b66fad4ebf08cc95b02ce48f5 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 14 Nov 2013 14:30:54 -0800
Subject: mm, thp: change pmd_trans_huge_lock() to return taken lock

With split ptlock it's important to know which lock
pmd_trans_huge_lock() took.  This patch adds one more parameter to the
function to return the lock.

In most places migration to new api is trivial.  Exception is
move_huge_pmd(): we need to take two locks if pmd tables are different.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Alex Thorlton <athorlton@sgi.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c      | 13 +++++++------
 include/linux/huge_mm.h | 14 +++++++-------
 mm/huge_memory.c        | 40 +++++++++++++++++++++++++++-------------
 mm/memcontrol.c         | 10 +++++-----
 4 files changed, 46 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8faaebdc6b02..42b5cf5d0326 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -506,9 +506,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
-		spin_unlock(&walk->mm->page_table_lock);
+		spin_unlock(ptl);
 		mss->anonymous_thp += HPAGE_PMD_SIZE;
 		return 0;
 	}
@@ -999,13 +999,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 {
 	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
+	spinlock_t *ptl;
 	pte_t *pte;
 	int err = 0;
 	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
-	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		int pmd_flags2;
 
 		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1023,7 +1024,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			if (err)
 				break;
 		}
-		spin_unlock(&walk->mm->page_table_lock);
+		spin_unlock(ptl);
 		return err;
 	}
 
@@ -1325,7 +1326,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	md = walk->private;
 
-	if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+	if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
 		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
@@ -1333,7 +1334,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		if (page)
 			gather_stats(page, md, pte_dirty(huge_pte),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
-		spin_unlock(&walk->mm->page_table_lock);
+		spin_unlock(ptl);
 		return 0;
 	}
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 3935428c57cf..4aca0d8da112 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -129,15 +129,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
 				    unsigned long start,
 				    unsigned long end,
 				    long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd,
-				 struct vm_area_struct *vma);
+extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+		spinlock_t **ptl);
 /* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
-				      struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+		spinlock_t **ptl)
 {
 	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
 	if (pmd_trans_huge(*pmd))
-		return __pmd_trans_huge_lock(pmd, vma);
+		return __pmd_trans_huge_lock(pmd, vma, ptl);
 	else
 		return 0;
 }
@@ -215,8 +215,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 					 long adjust_next)
 {
 }
-static inline int pmd_trans_huge_lock(pmd_t *pmd,
-				      struct vm_area_struct *vma)
+static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+		spinlock_t **ptl)
 {
 	return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e5b2d316be2e..471eb04066ff 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1376,9 +1376,10 @@ out:
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
+	spinlock_t *ptl;
 	int ret = 0;
 
-	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
@@ -1393,7 +1394,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			atomic_long_dec(&tlb->mm->nr_ptes);
-			spin_unlock(&tlb->mm->page_table_lock);
+			spin_unlock(ptl);
 			put_huge_zero_page();
 		} else {
 			page = pmd_page(orig_pmd);
@@ -1402,7 +1403,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 			VM_BUG_ON(!PageHead(page));
 			atomic_long_dec(&tlb->mm->nr_ptes);
-			spin_unlock(&tlb->mm->page_table_lock);
+			spin_unlock(ptl);
 			tlb_remove_page(tlb, page);
 		}
 		pte_free(tlb->mm, pgtable);
@@ -1415,14 +1416,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end,
 		unsigned char *vec)
 {
+	spinlock_t *ptl;
 	int ret = 0;
 
-	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		/*
 		 * All logical pages in the range are present
 		 * if backed by a huge page.
 		 */
-		spin_unlock(&vma->vm_mm->page_table_lock);
+		spin_unlock(ptl);
 		memset(vec, 1, (end - addr) >> PAGE_SHIFT);
 		ret = 1;
 	}
@@ -1435,6 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 		  unsigned long new_addr, unsigned long old_end,
 		  pmd_t *old_pmd, pmd_t *new_pmd)
 {
+	spinlock_t *old_ptl, *new_ptl;
 	int ret = 0;
 	pmd_t pmd;
 
@@ -1455,12 +1458,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 		goto out;
 	}
 
-	ret = __pmd_trans_huge_lock(old_pmd, vma);
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_sem prevents deadlock.
+	 */
+	ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
 	if (ret == 1) {
+		new_ptl = pmd_lockptr(mm, new_pmd);
+		if (new_ptl != old_ptl)
+			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
 		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-		spin_unlock(&mm->page_table_lock);
+		if (new_ptl != old_ptl)
+			spin_unlock(new_ptl);
+		spin_unlock(old_ptl);
 	}
 out:
 	return ret;
@@ -1476,9 +1488,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot, int prot_numa)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
 	int ret = 0;
 
-	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		pmd_t entry;
 		ret = 1;
 		if (!prot_numa) {
@@ -1507,7 +1520,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		if (ret == HPAGE_PMD_NR)
 			set_pmd_at(mm, addr, pmd, entry);
 
-		spin_unlock(&vma->vm_mm->page_table_lock);
+		spin_unlock(ptl);
 	}
 
 	return ret;
@@ -1520,12 +1533,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  * Note that if it returns 1, this routine returns without unlocking page
  * table locks. So callers must unlock them.
  */
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+		spinlock_t **ptl)
 {
-	spin_lock(&vma->vm_mm->page_table_lock);
+	*ptl = pmd_lock(vma->vm_mm, pmd);
 	if (likely(pmd_trans_huge(*pmd))) {
 		if (unlikely(pmd_trans_splitting(*pmd))) {
-			spin_unlock(&vma->vm_mm->page_table_lock);
+			spin_unlock(*ptl);
 			wait_split_huge_page(vma->anon_vma, pmd);
 			return -1;
 		} else {
@@ -1534,7 +1548,7 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 			return 1;
 		}
 	}
-	spin_unlock(&vma->vm_mm->page_table_lock);
+	spin_unlock(*ptl);
 	return 0;
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e3cd40b2d5d9..f1a0ae6e11b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6605,10 +6605,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
-		spin_unlock(&vma->vm_mm->page_table_lock);
+		spin_unlock(ptl);
 		return 0;
 	}
 
@@ -6797,9 +6797,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	 *    to be unlocked in __split_huge_page_splitting(), where the main
 	 *    part of thp split is not executed yet.
 	 */
-	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		if (mc.precharge < HPAGE_PMD_NR) {
-			spin_unlock(&vma->vm_mm->page_table_lock);
+			spin_unlock(ptl);
 			return 0;
 		}
 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6816,7 +6816,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			}
 			put_page(page);
 		}
-		spin_unlock(&vma->vm_mm->page_table_lock);
+		spin_unlock(ptl);
 		return 0;
 	}
 
-- 
cgit v1.3


From cb900f41215447433cbc456d1c4294e858a84d7c Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 14 Nov 2013 14:31:02 -0800
Subject: mm, hugetlb: convert hugetlbfs to use split pmd lock

Hugetlb supports multiple page sizes. We use split lock only for PMD
level, but not for PUD.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Alex Thorlton <athorlton@sgi.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c       |   2 +-
 include/linux/hugetlb.h |  26 ++++++++++++
 include/linux/swapops.h |   7 +--
 mm/hugetlb.c            | 110 +++++++++++++++++++++++++++++-------------------
 mm/mempolicy.c          |   5 ++-
 mm/migrate.c            |   7 +--
 mm/rmap.c               |   2 +-
 7 files changed, 105 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c805d5b69ba1..a77d2b299199 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -1,8 +1,8 @@
 #include <linux/fs.h>
-#include <linux/hugetlb.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0393270466c3..acd2010328f3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -392,6 +392,15 @@ static inline int hugepage_migration_support(struct hstate *h)
 	return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT);
 }
 
+static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+					   struct mm_struct *mm, pte_t *pte)
+{
+	if (huge_page_size(h) == PMD_SIZE)
+		return pmd_lockptr(mm, (pmd_t *) pte);
+	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
+	return &mm->page_table_lock;
+}
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page_node(h, nid) NULL
@@ -401,6 +410,7 @@ struct hstate {};
 #define hstate_sizelog(s) NULL
 #define hstate_vma(v) NULL
 #define hstate_inode(i) NULL
+#define page_hstate(page) NULL
 #define huge_page_size(h) PAGE_SIZE
 #define huge_page_mask(h) PAGE_MASK
 #define vma_kernel_pagesize(v) PAGE_SIZE
@@ -421,6 +431,22 @@ static inline pgoff_t basepage_index(struct page *page)
 #define dissolve_free_huge_pages(s, e)	do {} while (0)
 #define pmd_huge_support()	0
 #define hugepage_migration_support(h)	0
+
+static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+					   struct mm_struct *mm, pte_t *pte)
+{
+	return &mm->page_table_lock;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
+static inline spinlock_t *huge_pte_lock(struct hstate *h,
+					struct mm_struct *mm, pte_t *pte)
+{
+	spinlock_t *ptl;
+
+	ptl = huge_pte_lockptr(h, mm, pte);
+	spin_lock(ptl);
+	return ptl;
+}
+
 #endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 8d4fa82bfb91..c0f75261a728 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -139,7 +139,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry)
 
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
-extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma,
+		struct mm_struct *mm, pte_t *pte);
 #else
 
 #define make_migration_entry(page, write) swp_entry(0, 0)
@@ -151,8 +152,8 @@ static inline int is_migration_entry(swp_entry_t swp)
 static inline void make_migration_entry_read(swp_entry_t *entryp) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
-static inline void migration_entry_wait_huge(struct mm_struct *mm,
-					pte_t *pte) { }
+static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
+		struct mm_struct *mm, pte_t *pte) { }
 static inline int is_write_migration_entry(swp_entry_t entry)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0b7656e804d1..7d57af21f49e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2376,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+		spinlock_t *src_ptl, *dst_ptl;
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
@@ -2387,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		if (dst_pte == src_pte)
 			continue;
 
-		spin_lock(&dst->page_table_lock);
-		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+		dst_ptl = huge_pte_lock(h, dst, dst_pte);
+		src_ptl = huge_pte_lockptr(h, src, src_pte);
+		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -2398,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
+		spin_unlock(src_ptl);
+		spin_unlock(dst_ptl);
 	}
 	return 0;
 
@@ -2442,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
+	spinlock_t *ptl;
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -2455,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
-	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
 
+		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, &address, ptep))
-			continue;
+			goto unlock;
 
 		pte = huge_ptep_get(ptep);
 		if (huge_pte_none(pte))
-			continue;
+			goto unlock;
 
 		/*
 		 * HWPoisoned hugepage is already unmapped and dropped reference
 		 */
 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
 			huge_pte_clear(mm, address, ptep);
-			continue;
+			goto unlock;
 		}
 
 		page = pte_page(pte);
@@ -2484,7 +2487,7 @@ again:
 		 */
 		if (ref_page) {
 			if (page != ref_page)
-				continue;
+				goto unlock;
 
 			/*
 			 * Mark the VMA as having unmapped its page so that
@@ -2501,13 +2504,18 @@ again:
 
 		page_remove_rmap(page);
 		force_flush = !__tlb_remove_page(tlb, page);
-		if (force_flush)
+		if (force_flush) {
+			spin_unlock(ptl);
 			break;
+		}
 		/* Bail out after unmapping reference page if supplied */
-		if (ref_page)
+		if (ref_page) {
+			spin_unlock(ptl);
 			break;
+		}
+unlock:
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
 	 * the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2613,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
-			struct page *pagecache_page)
+			struct page *pagecache_page, spinlock_t *ptl)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct page *old_page, *new_page;
@@ -2647,8 +2655,8 @@ retry_avoidcopy:
 
 	page_cache_get(old_page);
 
-	/* Drop page_table_lock as buddy allocator may be called */
-	spin_unlock(&mm->page_table_lock);
+	/* Drop page table lock as buddy allocator may be called */
+	spin_unlock(ptl);
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
@@ -2666,13 +2674,13 @@ retry_avoidcopy:
 			BUG_ON(huge_pte_none(pte));
 			if (unmap_ref_private(mm, vma, old_page, address)) {
 				BUG_ON(huge_pte_none(pte));
-				spin_lock(&mm->page_table_lock);
+				spin_lock(ptl);
 				ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 				if (likely(pte_same(huge_ptep_get(ptep), pte)))
 					goto retry_avoidcopy;
 				/*
-				 * race occurs while re-acquiring page_table_lock, and
-				 * our job is done.
+				 * race occurs while re-acquiring page table
+				 * lock, and our job is done.
 				 */
 				return 0;
 			}
@@ -2680,7 +2688,7 @@ retry_avoidcopy:
 		}
 
 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		if (err == -ENOMEM)
 			return VM_FAULT_OOM;
 		else
@@ -2695,7 +2703,7 @@ retry_avoidcopy:
 		page_cache_release(new_page);
 		page_cache_release(old_page);
 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		return VM_FAULT_OOM;
 	}
 
@@ -2707,10 +2715,10 @@ retry_avoidcopy:
 	mmun_end = mmun_start + huge_page_size(h);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	/*
-	 * Retake the page_table_lock to check for racing updates
+	 * Retake the page table lock to check for racing updates
 	 * before the page tables are altered
 	 */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		ClearPagePrivate(new_page);
@@ -2724,13 +2732,13 @@ retry_avoidcopy:
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 
 	/* Caller expects lock to be held */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	return 0;
 }
 
@@ -2778,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	struct address_space *mapping;
 	pte_t new_pte;
+	spinlock_t *ptl;
 
 	/*
 	 * Currently, we are forced to kill the process in the event the
@@ -2864,7 +2873,8 @@ retry:
 			goto backout_unlocked;
 		}
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(h, mm, ptep);
+	spin_lock(ptl);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
 		goto backout;
@@ -2885,16 +2895,16 @@ retry:
 
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	unlock_page(page);
 out:
 	return ret;
 
 backout:
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 backout_unlocked:
 	unlock_page(page);
 	put_page(page);
@@ -2906,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	pte_t *ptep;
 	pte_t entry;
+	spinlock_t *ptl;
 	int ret;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
@@ -2918,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait_huge(mm, ptep);
+			migration_entry_wait_huge(vma, mm, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
@@ -2974,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (page != pagecache_page)
 		lock_page(page);
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(h, mm, ptep);
+	spin_lock(ptl);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
-		goto out_page_table_lock;
+		goto out_ptl;
 
 
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!huge_pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
-							pagecache_page);
-			goto out_page_table_lock;
+					pagecache_page, ptl);
+			goto out_ptl;
 		}
 		entry = huge_pte_mkdirty(entry);
 	}
@@ -2993,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 						flags & FAULT_FLAG_WRITE))
 		update_mmu_cache(vma, address, ptep);
 
-out_page_table_lock:
-	spin_unlock(&mm->page_table_lock);
+out_ptl:
+	spin_unlock(ptl);
 
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
@@ -3020,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
 
-	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		spinlock_t *ptl = NULL;
 		int absent;
 		struct page *page;
 
@@ -3030,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
 		 * each hugepage.  We have to make sure we get the
 		 * first, for the page indexing below to work.
+		 *
+		 * Note that page table lock is not held when pte is null.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		if (pte)
+			ptl = huge_pte_lock(h, mm, pte);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
 
 		/*
@@ -3043,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (absent && (flags & FOLL_DUMP) &&
 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+			if (pte)
+				spin_unlock(ptl);
 			remainder = 0;
 			break;
 		}
@@ -3062,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		      !huge_pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
-			spin_unlock(&mm->page_table_lock);
+			if (pte)
+				spin_unlock(ptl);
 			ret = hugetlb_fault(mm, vma, vaddr,
 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
-			spin_lock(&mm->page_table_lock);
 			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
@@ -3096,8 +3114,8 @@ same_page:
 			 */
 			goto same_page;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	*nr_pages = remainder;
 	*position = vaddr;
 
@@ -3118,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += huge_page_size(h)) {
+		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, &address, ptep)) {
 			pages++;
+			spin_unlock(ptl);
 			continue;
 		}
 		if (!huge_pte_none(huge_ptep_get(ptep))) {
@@ -3134,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			set_huge_pte_at(mm, address, ptep, pte);
 			pages++;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
@@ -3298,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	unsigned long saddr;
 	pte_t *spte = NULL;
 	pte_t *pte;
+	spinlock_t *ptl;
 
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3320,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!spte)
 		goto out;
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
+	spin_lock(ptl);
 	if (pud_none(*pud))
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
 	else
 		put_page(virt_to_page(spte));
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -3340,7 +3362,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * called with vma->vm_mm->page_table_lock held.
+ * called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *	    0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4cc19f6ab6c6..c4403cdf3433 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 #ifdef CONFIG_HUGETLB_PAGE
 	int nid;
 	struct page *page;
+	spinlock_t *ptl;
 
-	spin_lock(&vma->vm_mm->page_table_lock);
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 	page = pte_page(huge_ptep_get((pte_t *)pmd));
 	nid = page_to_nid(page);
 	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 		isolate_huge_page(page, private);
 unlock:
-	spin_unlock(&vma->vm_mm->page_table_lock);
+	spin_unlock(ptl);
 #else
 	BUG();
 #endif
diff --git a/mm/migrate.c b/mm/migrate.c
index dfc8300ecbb2..01f45cefa4cd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 		ptep = huge_pte_offset(mm, addr);
 		if (!ptep)
 			goto out;
-		ptl = &mm->page_table_lock;
+		ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
 	} else {
 		pmd = mm_find_pmd(mm, addr);
 		if (!pmd)
@@ -249,9 +249,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	__migration_entry_wait(mm, ptep, ptl);
 }
 
-void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
+void migration_entry_wait_huge(struct vm_area_struct *vma,
+		struct mm_struct *mm, pte_t *pte)
 {
-	spinlock_t *ptl = &(mm)->page_table_lock;
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
 	__migration_entry_wait(mm, pte, ptl);
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index b59d741dcf65..55c8b8dc9ffb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 
 	if (unlikely(PageHuge(page))) {
 		pte = huge_pte_offset(mm, address);
-		ptl = &mm->page_table_lock;
+		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
 		goto check;
 	}
 
-- 
cgit v1.3


From 839cc2a94cc3665bafe32203c2f095f4dd470a80 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 14 Nov 2013 14:31:56 -0800
Subject: seq_file: introduce seq_setwidth() and seq_pad()

There are several users who want to know bytes written by seq_*() for
alignment purpose.  Currently they are using %n format for knowing it
because seq_*() returns 0 on success.

This patch introduces seq_setwidth() and seq_pad() for allowing them to
align without using %n format.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Joe Perches <joe@perches.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c            | 15 +++++++++++++++
 include/linux/seq_file.h | 15 +++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index a290157265ef..1cd2388ca5bd 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -766,6 +766,21 @@ int seq_write(struct seq_file *seq, const void *data, size_t len)
 }
 EXPORT_SYMBOL(seq_write);
 
+/**
+ * seq_pad - write padding spaces to buffer
+ * @m: seq_file identifying the buffer to which data should be written
+ * @c: the byte to append after padding if non-zero
+ */
+void seq_pad(struct seq_file *m, char c)
+{
+	int size = m->pad_until - m->count;
+	if (size > 0)
+		seq_printf(m, "%*s", size, "");
+	if (c)
+		seq_putc(m, c);
+}
+EXPORT_SYMBOL(seq_pad);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
 	struct list_head *lh;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 4e32edc8f506..52e0097f61f0 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -20,6 +20,7 @@ struct seq_file {
 	size_t size;
 	size_t from;
 	size_t count;
+	size_t pad_until;
 	loff_t index;
 	loff_t read_pos;
 	u64 version;
@@ -79,6 +80,20 @@ static inline void seq_commit(struct seq_file *m, int num)
 	}
 }
 
+/**
+ * seq_setwidth - set padding width
+ * @m: the seq_file handle
+ * @size: the max number of bytes to pad.
+ *
+ * Call seq_setwidth() for setting max width, then call seq_printf() etc. and
+ * finally call seq_pad() to pad the remaining bytes.
+ */
+static inline void seq_setwidth(struct seq_file *m, size_t size)
+{
+	m->pad_until = m->count + size;
+}
+void seq_pad(struct seq_file *m, char c);
+
 char *mangle_path(char *s, const char *p, const char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
-- 
cgit v1.3


From 652586df95e5d76b37d07a11839126dcfede1621 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 14 Nov 2013 14:31:57 -0800
Subject: seq_file: remove "%n" usage from seq_file users

All seq_printf() users are using "%n" for calculating padding size,
convert them to use seq_setwidth() / seq_pad() pair.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Joe Perches <joe@perches.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/consoles.c   | 10 ++++------
 fs/proc/nommu.c      | 12 +++++-------
 fs/proc/task_mmu.c   | 20 ++++++--------------
 fs/proc/task_nommu.c | 19 ++++++-------------
 net/ipv4/fib_trie.c  | 13 +++++++------
 net/ipv4/ping.c      | 15 +++++++--------
 net/ipv4/tcp_ipv4.c  | 33 +++++++++++++++------------------
 net/ipv4/udp.c       | 15 +++++++--------
 net/phonet/socket.c  | 24 +++++++++++-------------
 net/sctp/objcnt.c    |  9 +++++----
 10 files changed, 73 insertions(+), 97 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index b701eaa482bf..51942d5abcec 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -29,7 +29,6 @@ static int show_console_dev(struct seq_file *m, void *v)
 	char flags[ARRAY_SIZE(con_flags) + 1];
 	struct console *con = v;
 	unsigned int a;
-	int len;
 	dev_t dev = 0;
 
 	if (con->device) {
@@ -47,11 +46,10 @@ static int show_console_dev(struct seq_file *m, void *v)
 			con_flags[a].name : ' ';
 	flags[a] = 0;
 
-	seq_printf(m, "%s%d%n", con->name, con->index, &len);
-	len = 21 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
+	seq_setwidth(m, 21 - 1);
+	seq_printf(m, "%s%d", con->name, con->index);
+	seq_pad(m, ' ');
+	seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
 			con->write ? 'W' : '-', con->unblank ? 'U' : '-',
 			flags);
 	if (dev)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index ccfd99bd1c5a..5f9bc8a746c9 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -39,7 +39,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
-	int flags, len;
+	int flags;
 
 	flags = region->vm_flags;
 	file = region->vm_file;
@@ -50,8 +50,9 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 		ino = inode->i_ino;
 	}
 
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
 	seq_printf(m,
-		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
 		   region->vm_start,
 		   region->vm_end,
 		   flags & VM_READ ? 'r' : '-',
@@ -59,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
 		   ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
-		   MAJOR(dev), MINOR(dev), ino, &len);
+		   MAJOR(dev), MINOR(dev), ino);
 
 	if (file) {
-		len = 25 + sizeof(void *) * 6 - len;
-		if (len < 1)
-			len = 1;
-		seq_printf(m, "%*c", len, ' ');
+		seq_pad(m, ' ');
 		seq_path(m, &file->f_path, "");
 	}
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 42b5cf5d0326..fb52b548080d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -84,14 +84,6 @@ unsigned long task_statm(struct mm_struct *mm,
 	return mm->total_vm;
 }
 
-static void pad_len_spaces(struct seq_file *m, int len)
-{
-	len = 25 + sizeof(void*) * 6 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c", len, ' ');
-}
-
 #ifdef CONFIG_NUMA
 /*
  * These functions are for numa_maps but called in generic **maps seq_file
@@ -269,7 +261,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	unsigned long long pgoff = 0;
 	unsigned long start, end;
 	dev_t dev = 0;
-	int len;
 	const char *name = NULL;
 
 	if (file) {
@@ -287,7 +278,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	if (stack_guard_page_end(vma, end))
 		end -= PAGE_SIZE;
 
-	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
 			start,
 			end,
 			flags & VM_READ ? 'r' : '-',
@@ -295,14 +287,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 			flags & VM_EXEC ? 'x' : '-',
 			flags & VM_MAYSHARE ? 's' : 'p',
 			pgoff,
-			MAJOR(dev), MINOR(dev), ino, &len);
+			MAJOR(dev), MINOR(dev), ino);
 
 	/*
 	 * Print the dentry name for named mappings, and a
 	 * special [heap] marker for the heap:
 	 */
 	if (file) {
-		pad_len_spaces(m, len);
+		seq_pad(m, ' ');
 		seq_path(m, &file->f_path, "\n");
 		goto done;
 	}
@@ -334,7 +326,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 				name = "[stack]";
 			} else {
 				/* Thread stack in /proc/PID/maps */
-				pad_len_spaces(m, len);
+				seq_pad(m, ' ');
 				seq_printf(m, "[stack:%d]", tid);
 			}
 		}
@@ -342,7 +334,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 done:
 	if (name) {
-		pad_len_spaces(m, len);
+		seq_pad(m, ' ');
 		seq_puts(m, name);
 	}
 	seq_putc(m, '\n');
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 56123a6f462e..678455d2d683 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,14 +123,6 @@ unsigned long task_statm(struct mm_struct *mm,
 	return size;
 }
 
-static void pad_len_spaces(struct seq_file *m, int len)
-{
-	len = 25 + sizeof(void*) * 6 - len;
-	if (len < 1)
-		len = 1;
-	seq_printf(m, "%*c", len, ' ');
-}
-
 /*
  * display a single VMA to a sequenced file
  */
@@ -142,7 +134,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
-	int flags, len;
+	int flags;
 	unsigned long long pgoff = 0;
 
 	flags = vma->vm_flags;
@@ -155,8 +147,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 	}
 
+	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
 	seq_printf(m,
-		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
 		   vma->vm_start,
 		   vma->vm_end,
 		   flags & VM_READ ? 'r' : '-',
@@ -164,16 +157,16 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
 		   pgoff,
-		   MAJOR(dev), MINOR(dev), ino, &len);
+		   MAJOR(dev), MINOR(dev), ino);
 
 	if (file) {
-		pad_len_spaces(m, len);
+		seq_pad(m, ' ');
 		seq_path(m, &file->f_path, "");
 	} else if (mm) {
 		pid_t tid = vm_is_stack(priv->task, vma, is_pid);
 
 		if (tid != 0) {
-			pad_len_spaces(m, len);
+			seq_pad(m, ' ');
 			/*
 			 * Thread stack in /proc/PID/task/TID/maps or
 			 * the main process stack.
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ec9a9ef4ce50..5afeb5aa4c7c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2523,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
 			const struct fib_info *fi = fa->fa_info;
 			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
-			int len;
 
 			if (fa->fa_type == RTN_BROADCAST
 			    || fa->fa_type == RTN_MULTICAST)
 				continue;
 
+			seq_setwidth(seq, 127);
+
 			if (fi)
 				seq_printf(seq,
 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
-					 "%d\t%08X\t%d\t%u\t%u%n",
+					 "%d\t%08X\t%d\t%u\t%u",
 					 fi->fib_dev ? fi->fib_dev->name : "*",
 					 prefix,
 					 fi->fib_nh->nh_gw, flags, 0, 0,
@@ -2541,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 					 (fi->fib_advmss ?
 					  fi->fib_advmss + 40 : 0),
 					 fi->fib_window,
-					 fi->fib_rtt >> 3, &len);
+					 fi->fib_rtt >> 3);
 			else
 				seq_printf(seq,
 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
-					 "%d\t%08X\t%d\t%u\t%u%n",
+					 "%d\t%08X\t%d\t%u\t%u",
 					 prefix, 0, flags, 0, 0, 0,
-					 mask, 0, 0, 0, &len);
+					 mask, 0, 0, 0);
 
-			seq_printf(seq, "%*s\n", 127 - len, "");
+			seq_pad(seq, '\n');
 		}
 	}
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9afbdb19f4a2..cbc85f660d54 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1076,7 +1076,7 @@ void ping_seq_stop(struct seq_file *seq, void *v)
 EXPORT_SYMBOL_GPL(ping_seq_stop);
 
 static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
-		int bucket, int *len)
+		int bucket)
 {
 	struct inet_sock *inet = inet_sk(sp);
 	__be32 dest = inet->inet_daddr;
@@ -1085,7 +1085,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
 	__u16 srcp = ntohs(inet->inet_sport);
 
 	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
@@ -1093,23 +1093,22 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
 		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
-		atomic_read(&sp->sk_drops), len);
+		atomic_read(&sp->sk_drops));
 }
 
 static int ping_v4_seq_show(struct seq_file *seq, void *v)
 {
+	seq_setwidth(seq, 127);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%-127s\n",
-			   "  sl  local_address rem_address   st tx_queue "
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
 			   "inode ref pointer drops");
 	else {
 		struct ping_iter_state *state = seq->private;
-		int len;
 
-		ping_v4_format_sock(v, seq, state->bucket, &len);
-		seq_printf(seq, "%*s\n", 127 - len, "");
+		ping_v4_format_sock(v, seq, state->bucket);
 	}
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 14bba8a1c5a7..59a6f8b90cd9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2541,13 +2541,13 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 EXPORT_SYMBOL(tcp_proc_unregister);
 
 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
-			 struct seq_file *f, int i, kuid_t uid, int *len)
+			 struct seq_file *f, int i, kuid_t uid)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	long delta = req->expires - jiffies;
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
 		i,
 		ireq->ir_loc_addr,
 		ntohs(inet_sk(sk)->inet_sport),
@@ -2562,11 +2562,10 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 		0,  /* non standard timer */
 		0, /* open_requests have no inode */
 		atomic_read(&sk->sk_refcnt),
-		req,
-		len);
+		req);
 }
 
-static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 {
 	int timer_active;
 	unsigned long timer_expires;
@@ -2605,7 +2604,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
-			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n",
+			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
 		i, src, srcp, dest, destp, sk->sk_state,
 		tp->write_seq - tp->snd_una,
 		rx_queue,
@@ -2622,12 +2621,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 		tp->snd_cwnd,
 		sk->sk_state == TCP_LISTEN ?
 		    (fastopenq ? fastopenq->max_qlen : 0) :
-		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
-		len);
+		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
 }
 
 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
-			       struct seq_file *f, int i, int *len)
+			       struct seq_file *f, int i)
 {
 	__be32 dest, src;
 	__u16 destp, srcp;
@@ -2639,10 +2637,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
 	srcp  = ntohs(tw->tw_sport);
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
-		atomic_read(&tw->tw_refcnt), tw, len);
+		atomic_read(&tw->tw_refcnt), tw);
 }
 
 #define TMPSZ 150
@@ -2651,11 +2649,10 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 {
 	struct tcp_iter_state *st;
 	struct sock *sk = v;
-	int len;
 
+	seq_setwidth(seq, TMPSZ - 1);
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "%-*s\n", TMPSZ - 1,
-			   "  sl  local_address rem_address   st tx_queue "
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
 			   "inode");
 		goto out;
@@ -2666,16 +2663,16 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 	case TCP_SEQ_STATE_LISTENING:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (sk->sk_state == TCP_TIME_WAIT)
-			get_timewait4_sock(v, seq, st->num, &len);
+			get_timewait4_sock(v, seq, st->num);
 		else
-			get_tcp4_sock(v, seq, st->num, &len);
+			get_tcp4_sock(v, seq, st->num);
 		break;
 	case TCP_SEQ_STATE_OPENREQ:
-		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
+		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
 		break;
 	}
-	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
 out:
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89909dd730dd..de86e5bc4462 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2331,7 +2331,7 @@ EXPORT_SYMBOL(udp_proc_unregister);
 
 /* ------------------------------------------------------------------------ */
 static void udp4_format_sock(struct sock *sp, struct seq_file *f,
-		int bucket, int *len)
+		int bucket)
 {
 	struct inet_sock *inet = inet_sk(sp);
 	__be32 dest = inet->inet_daddr;
@@ -2340,7 +2340,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 	__u16 srcp	  = ntohs(inet->inet_sport);
 
 	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n",
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
@@ -2348,23 +2348,22 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
 		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
-		atomic_read(&sp->sk_drops), len);
+		atomic_read(&sp->sk_drops));
 }
 
 int udp4_seq_show(struct seq_file *seq, void *v)
 {
+	seq_setwidth(seq, 127);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%-127s\n",
-			   "  sl  local_address rem_address   st tx_queue "
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
 			   "inode ref pointer drops");
 	else {
 		struct udp_iter_state *state = seq->private;
-		int len;
 
-		udp4_format_sock(v, seq, state->bucket, &len);
-		seq_printf(seq, "%*s\n", 127 - len, "");
+		udp4_format_sock(v, seq, state->bucket);
 	}
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 77e38f733496..008214a3d5eb 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -595,26 +595,25 @@ static void pn_sock_seq_stop(struct seq_file *seq, void *v)
 
 static int pn_sock_seq_show(struct seq_file *seq, void *v)
 {
-	int len;
-
+	seq_setwidth(seq, 127);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%s%n", "pt  loc  rem rs st tx_queue rx_queue "
-			"  uid inode ref pointer drops", &len);
+		seq_puts(seq, "pt  loc  rem rs st tx_queue rx_queue "
+			"  uid inode ref pointer drops");
 	else {
 		struct sock *sk = v;
 		struct pn_sock *pn = pn_sk(sk);
 
 		seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu "
-			"%d %pK %d%n",
+			"%d %pK %d",
 			sk->sk_protocol, pn->sobject, pn->dobject,
 			pn->resource, sk->sk_state,
 			sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
 			from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
 			sock_i_ino(sk),
 			atomic_read(&sk->sk_refcnt), sk,
-			atomic_read(&sk->sk_drops), &len);
+			atomic_read(&sk->sk_drops));
 	}
-	seq_printf(seq, "%*s\n", 127 - len, "");
+	seq_pad(seq, '\n');
 	return 0;
 }
 
@@ -785,20 +784,19 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v)
 
 static int pn_res_seq_show(struct seq_file *seq, void *v)
 {
-	int len;
-
+	seq_setwidth(seq, 63);
 	if (v == SEQ_START_TOKEN)
-		seq_printf(seq, "%s%n", "rs   uid inode", &len);
+		seq_puts(seq, "rs   uid inode");
 	else {
 		struct sock **psk = v;
 		struct sock *sk = *psk;
 
-		seq_printf(seq, "%02X %5u %lu%n",
+		seq_printf(seq, "%02X %5u %lu",
 			   (int) (psk - pnres.sk),
 			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
-			   sock_i_ino(sk), &len);
+			   sock_i_ino(sk));
 	}
-	seq_printf(seq, "%*s\n", 63 - len, "");
+	seq_pad(seq, '\n');
 	return 0;
 }
 
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 5ea573b37648..647396baa56f 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -79,12 +79,13 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
  */
 static int sctp_objcnt_seq_show(struct seq_file *seq, void *v)
 {
-	int i, len;
+	int i;
 
 	i = (int)*(loff_t *)v;
-	seq_printf(seq, "%s: %d%n", sctp_dbg_objcnt[i].label,
-				atomic_read(sctp_dbg_objcnt[i].counter), &len);
-	seq_printf(seq, "%*s\n", 127 - len, "");
+	seq_setwidth(seq, 127);
+	seq_printf(seq, "%s: %d", sctp_dbg_objcnt[i].label,
+				atomic_read(sctp_dbg_objcnt[i].counter));
+	seq_pad(seq, '\n');
 	return 0;
 }
 
-- 
cgit v1.3


From 16735d022f72b20ddbb2274b8e109f69575e9b2b Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Thu, 14 Nov 2013 14:32:02 -0800
Subject: tree-wide: use reinit_completion instead of INIT_COMPLETION

Use this new function to make code more comprehensible, since we are
reinitialzing the completion, not initializing.

[akpm@linux-foundation.org: linux-next resyncs]
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
Acked-by: Linus Walleij <linus.walleij@linaro.org> (personally at LCE13)
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-tegra/apbio.c                          |  2 +-
 arch/powerpc/platforms/powermac/low_i2c.c            |  6 +++---
 arch/powerpc/platforms/pseries/suspend.c             |  2 +-
 crypto/af_alg.c                                      |  2 +-
 crypto/tcrypt.c                                      |  4 ++--
 crypto/testmgr.c                                     | 12 ++++++------
 drivers/ata/libata-eh.c                              |  4 ++--
 drivers/base/power/main.c                            |  4 ++--
 drivers/block/amiflop.c                              |  2 +-
 drivers/block/cciss.c                                |  4 ++--
 drivers/char/hw_random/timeriomem-rng.c              |  2 +-
 drivers/crypto/tegra-aes.c                           |  2 +-
 drivers/firewire/core-transaction.c                  |  2 +-
 drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c           |  2 +-
 drivers/hid/hid-wiimote.h                            |  2 +-
 drivers/hwmon/jz4740-hwmon.c                         |  2 +-
 drivers/i2c/busses/i2c-at91.c                        |  2 +-
 drivers/i2c/busses/i2c-bcm2835.c                     |  2 +-
 drivers/i2c/busses/i2c-davinci.c                     |  2 +-
 drivers/i2c/busses/i2c-designware-core.c             |  2 +-
 drivers/i2c/busses/i2c-ismt.c                        |  2 +-
 drivers/i2c/busses/i2c-mxs.c                         |  2 +-
 drivers/i2c/busses/i2c-omap.c                        |  2 +-
 drivers/i2c/busses/i2c-tegra.c                       |  2 +-
 drivers/i2c/busses/i2c-wmt.c                         |  4 ++--
 drivers/iio/adc/ad_sigma_delta.c                     |  6 +++---
 drivers/iio/adc/nau7802.c                            |  2 +-
 drivers/input/touchscreen/cyttsp_core.c              |  2 +-
 drivers/md/dm-crypt.c                                |  2 +-
 drivers/media/platform/blackfin/bfin_capture.c       |  2 +-
 drivers/media/radio/radio-wl1273.c                   | 10 +++++-----
 drivers/media/radio/si470x/radio-si470x-common.c     |  4 ++--
 drivers/media/rc/iguanair.c                          |  2 +-
 drivers/memstick/core/memstick.c                     |  2 +-
 drivers/memstick/host/r592.c                         |  2 +-
 drivers/misc/mic/card/mic_virtio.c                   |  2 +-
 drivers/misc/mic/host/mic_boot.c                     |  2 +-
 drivers/misc/ti-st/st_kim.c                          | 12 ++++++------
 drivers/mtd/nand/mxc_nand.c                          |  2 +-
 drivers/mtd/nand/r852.c                              |  2 +-
 drivers/mtd/onenand/omap2.c                          | 10 +++++-----
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c  |  2 +-
 drivers/net/ieee802154/at86rf230.c                   |  2 +-
 drivers/net/ieee802154/mrf24j40.c                    |  2 +-
 drivers/net/wireless/ath/ath10k/htc.c                |  4 ++--
 drivers/net/wireless/ath/ath10k/mac.c                | 18 +++++++++---------
 drivers/net/wireless/ath/carl9170/usb.c              |  2 +-
 drivers/net/wireless/ath/wil6210/main.c              |  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c        |  4 ++--
 drivers/net/wireless/zd1211rw/zd_usb.c               |  2 +-
 drivers/parport/parport_ip32.c                       |  4 ++--
 drivers/platform/x86/apple-gmux.c                    |  2 +-
 drivers/power/ab8500_fg.c                            |  4 ++--
 drivers/power/jz4740-battery.c                       |  2 +-
 drivers/rtc/rtc-hid-sensor-time.c                    |  2 +-
 drivers/spi/spi-bcm2835.c                            |  2 +-
 drivers/spi/spi-clps711x.c                           |  2 +-
 drivers/spi/spi-davinci.c                            |  2 +-
 drivers/spi/spi-fsl-espi.c                           |  2 +-
 drivers/spi/spi-fsl-spi.c                            |  2 +-
 drivers/spi/spi-mpc512x-psc.c                        |  2 +-
 drivers/spi/spi-mxs.c                                |  2 +-
 drivers/spi/spi-s3c64xx.c                            |  2 +-
 drivers/spi/spi-sh-msiof.c                           |  2 +-
 drivers/spi/spi-sirf.c                               |  4 ++--
 drivers/spi/spi-tegra114.c                           |  6 +++---
 drivers/spi/spi-tegra20-sflash.c                     |  2 +-
 drivers/spi/spi-tegra20-slink.c                      |  6 +++---
 drivers/spi/spi-xilinx.c                             |  2 +-
 drivers/spi/spi.c                                    |  2 +-
 drivers/staging/iio/adc/mxs-lradc.c                  |  2 +-
 drivers/staging/media/solo6x10/solo6x10-p2m.c        |  2 +-
 drivers/staging/tidspbridge/core/sync.c              |  4 ++--
 drivers/staging/tidspbridge/include/dspbridge/sync.h |  2 +-
 drivers/staging/tidspbridge/rmgr/drv_interface.c     |  6 +++---
 drivers/tty/metag_da.c                               |  2 +-
 drivers/usb/c67x00/c67x00-sched.c                    |  2 +-
 drivers/usb/gadget/f_fs.c                            |  2 +-
 drivers/usb/serial/mos7720.c                         |  2 +-
 drivers/video/exynos/exynos_mipi_dsi_common.c        |  4 ++--
 drivers/video/omap2/displays-new/encoder-tpd12s015.c |  2 +-
 fs/ecryptfs/crypto.c                                 |  2 +-
 fs/nfs/nfs4state.c                                   |  2 +-
 fs/ocfs2/dlmglue.c                                   |  4 ++--
 sound/firewire/dice.c                                |  2 +-
 sound/soc/samsung/ac97.c                             |  6 +++---
 86 files changed, 138 insertions(+), 138 deletions(-)

(limited to 'fs')

diff --git a/arch/arm/mach-tegra/apbio.c b/arch/arm/mach-tegra/apbio.c
index d7aa52ea6cfc..bc471973cf04 100644
--- a/arch/arm/mach-tegra/apbio.c
+++ b/arch/arm/mach-tegra/apbio.c
@@ -114,7 +114,7 @@ static int do_dma_transfer(unsigned long apb_add,
 	dma_desc->callback = apb_dma_complete;
 	dma_desc->callback_param = NULL;
 
-	INIT_COMPLETION(tegra_apb_wait);
+	reinit_completion(&tegra_apb_wait);
 
 	dmaengine_submit(dma_desc);
 	dma_async_issue_pending(tegra_apb_dma_chan);
diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index fc536f2971c0..7553b6a77c64 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -452,7 +452,7 @@ static int kw_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
 	 */
 	if (use_irq) {
 		/* Clear completion */
-		INIT_COMPLETION(host->complete);
+		reinit_completion(&host->complete);
 		/* Ack stale interrupts */
 		kw_write_reg(reg_isr, kw_read_reg(reg_isr));
 		/* Arm timeout */
@@ -717,7 +717,7 @@ static int pmu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
 			return -EINVAL;
 		}
 
-		INIT_COMPLETION(comp);
+		reinit_completion(&comp);
 		req->data[0] = PMU_I2C_CMD;
 		req->reply[0] = 0xff;
 		req->nbytes = sizeof(struct pmu_i2c_hdr) + 1;
@@ -748,7 +748,7 @@ static int pmu_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
 
 		hdr->bus = PMU_I2C_BUS_STATUS;
 
-		INIT_COMPLETION(comp);
+		reinit_completion(&comp);
 		req->data[0] = PMU_I2C_CMD;
 		req->reply[0] = 0xff;
 		req->nbytes = 2;
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
index 5f997e79d570..16a255255d30 100644
--- a/arch/powerpc/platforms/pseries/suspend.c
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -106,7 +106,7 @@ static int pseries_prepare_late(void)
 	atomic_set(&suspend_data.done, 0);
 	atomic_set(&suspend_data.error, 0);
 	suspend_data.complete = &suspend_work;
-	INIT_COMPLETION(suspend_work);
+	reinit_completion(&suspend_work);
 	return 0;
 }
 
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index ac33d5f30778..966f893711b3 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -434,7 +434,7 @@ int af_alg_wait_for_completion(int err, struct af_alg_completion *completion)
 	case -EINPROGRESS:
 	case -EBUSY:
 		wait_for_completion(&completion->completion);
-		INIT_COMPLETION(completion->completion);
+		reinit_completion(&completion->completion);
 		err = completion->err;
 		break;
 	};
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 25a5934f0e50..1ab8258fcf56 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -493,7 +493,7 @@ static inline int do_one_ahash_op(struct ahash_request *req, int ret)
 		ret = wait_for_completion_interruptible(&tr->completion);
 		if (!ret)
 			ret = tr->err;
-		INIT_COMPLETION(tr->completion);
+		reinit_completion(&tr->completion);
 	}
 	return ret;
 }
@@ -721,7 +721,7 @@ static inline int do_one_acipher_op(struct ablkcipher_request *req, int ret)
 		ret = wait_for_completion_interruptible(&tr->completion);
 		if (!ret)
 			ret = tr->err;
-		INIT_COMPLETION(tr->completion);
+		reinit_completion(&tr->completion);
 	}
 
 	return ret;
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index e091ef6e1791..432afc03e7c3 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -179,7 +179,7 @@ static int do_one_async_hash_op(struct ahash_request *req,
 		ret = wait_for_completion_interruptible(&tr->completion);
 		if (!ret)
 			ret = tr->err;
-		INIT_COMPLETION(tr->completion);
+		reinit_completion(&tr->completion);
 	}
 	return ret;
 }
@@ -336,7 +336,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 				ret = wait_for_completion_interruptible(
 					&tresult.completion);
 				if (!ret && !(ret = tresult.err)) {
-					INIT_COMPLETION(tresult.completion);
+					reinit_completion(&tresult.completion);
 					break;
 				}
 				/* fall through */
@@ -543,7 +543,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
 				ret = wait_for_completion_interruptible(
 					&result.completion);
 				if (!ret && !(ret = result.err)) {
-					INIT_COMPLETION(result.completion);
+					reinit_completion(&result.completion);
 					break;
 				}
 			case -EBADMSG:
@@ -697,7 +697,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
 				ret = wait_for_completion_interruptible(
 					&result.completion);
 				if (!ret && !(ret = result.err)) {
-					INIT_COMPLETION(result.completion);
+					reinit_completion(&result.completion);
 					break;
 				}
 			case -EBADMSG:
@@ -983,7 +983,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 				ret = wait_for_completion_interruptible(
 					&result.completion);
 				if (!ret && !((ret = result.err))) {
-					INIT_COMPLETION(result.completion);
+					reinit_completion(&result.completion);
 					break;
 				}
 				/* fall through */
@@ -1086,7 +1086,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 				ret = wait_for_completion_interruptible(
 					&result.completion);
 				if (!ret && !((ret = result.err))) {
-					INIT_COMPLETION(result.completion);
+					reinit_completion(&result.completion);
 					break;
 				}
 				/* fall through */
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 77bbc8266883..92d7797223be 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -3017,7 +3017,7 @@ static inline void ata_eh_pull_park_action(struct ata_port *ap)
 	 * ourselves at the beginning of each pass over the loop.
 	 *
 	 * Additionally, all write accesses to &ap->park_req_pending
-	 * through INIT_COMPLETION() (see below) or complete_all()
+	 * through reinit_completion() (see below) or complete_all()
 	 * (see ata_scsi_park_store()) are protected by the host lock.
 	 * As a result we have that park_req_pending.done is zero on
 	 * exit from this function, i.e. when ATA_EH_PARK actions for
@@ -3031,7 +3031,7 @@ static inline void ata_eh_pull_park_action(struct ata_port *ap)
 	 */
 
 	spin_lock_irqsave(ap->lock, flags);
-	INIT_COMPLETION(ap->park_req_pending);
+	reinit_completion(&ap->park_req_pending);
 	ata_for_each_link(link, ap, EDGE) {
 		ata_for_each_dev(dev, link, ALL) {
 			struct ata_eh_info *ehi = &link->eh_info;
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index ee039afe9078..c12e9b9556be 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -757,7 +757,7 @@ void dpm_resume(pm_message_t state)
 	async_error = 0;
 
 	list_for_each_entry(dev, &dpm_suspended_list, power.entry) {
-		INIT_COMPLETION(dev->power.completion);
+		reinit_completion(&dev->power.completion);
 		if (is_async(dev)) {
 			get_device(dev);
 			async_schedule(async_resume, dev);
@@ -1237,7 +1237,7 @@ static void async_suspend(void *data, async_cookie_t cookie)
 
 static int device_suspend(struct device *dev)
 {
-	INIT_COMPLETION(dev->power.completion);
+	reinit_completion(&dev->power.completion);
 
 	if (pm_async_enabled && dev->power.async_suspend) {
 		get_device(dev);
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 4ff85b8785ee..748dea4f34dc 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -343,7 +343,7 @@ static int fd_motor_on(int nr)
 		unit[nr].motor = 1;
 		fd_select(nr);
 
-		INIT_COMPLETION(motor_on_completion);
+		reinit_completion(&motor_on_completion);
 		motor_on_timer.data = nr;
 		mod_timer(&motor_on_timer, jiffies + HZ/2);
 
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 0c004ac05811..b35fc4f5237c 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2808,7 +2808,7 @@ resend_cmd2:
 		/* erase the old error information */
 		memset(c->err_info, 0, sizeof(ErrorInfo_struct));
 		return_status = IO_OK;
-		INIT_COMPLETION(wait);
+		reinit_completion(&wait);
 		goto resend_cmd2;
 	}
 
@@ -3669,7 +3669,7 @@ static int add_to_scan_list(struct ctlr_info *h)
 		}
 	}
 	if (!found && !h->busy_scanning) {
-		INIT_COMPLETION(h->scan_wait);
+		reinit_completion(&h->scan_wait);
 		list_add_tail(&h->scan_list, &scan_q);
 		ret = 1;
 	}
diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c
index d2120ba8f3f9..73ce739f8e19 100644
--- a/drivers/char/hw_random/timeriomem-rng.c
+++ b/drivers/char/hw_random/timeriomem-rng.c
@@ -79,7 +79,7 @@ static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data)
 	priv->expires = cur + delay;
 	priv->present = 0;
 
-	INIT_COMPLETION(priv->completion);
+	reinit_completion(&priv->completion);
 	mod_timer(&priv->timer, priv->expires);
 
 	return 4;
diff --git a/drivers/crypto/tegra-aes.c b/drivers/crypto/tegra-aes.c
index 2d58da972ae2..fa05e3c329bd 100644
--- a/drivers/crypto/tegra-aes.c
+++ b/drivers/crypto/tegra-aes.c
@@ -268,7 +268,7 @@ static int aes_start_crypt(struct tegra_aes_dev *dd, u32 in_addr, u32 out_addr,
 	aes_writel(dd, value, TEGRA_AES_SECURE_INPUT_SELECT);
 
 	aes_writel(dd, out_addr, TEGRA_AES_SECURE_DEST_ADDR);
-	INIT_COMPLETION(dd->op_complete);
+	reinit_completion(&dd->op_complete);
 
 	for (i = 0; i < AES_HW_MAX_ICQ_LENGTH - 1; i++) {
 		do {
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index e5af0e3a26ec..0e799516a2ab 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -477,7 +477,7 @@ void fw_send_phy_config(struct fw_card *card,
 	phy_config_packet.header[1] = data;
 	phy_config_packet.header[2] = ~data;
 	phy_config_packet.generation = generation;
-	INIT_COMPLETION(phy_config_done);
+	reinit_completion(&phy_config_done);
 
 	card->driver->send_request(card, &phy_config_packet);
 	wait_for_completion_timeout(&phy_config_done, timeout);
diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c
index 1eb86c79523e..e28107061148 100644
--- a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c
+++ b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c
@@ -99,7 +99,7 @@ static int xfer_read(struct i2c_adapter *adap, struct i2c_msg *pmsg)
 	i2c_dev->status = I2C_STAT_INIT;
 	i2c_dev->msg = pmsg;
 	i2c_dev->buf_offset = 0;
-	INIT_COMPLETION(i2c_dev->complete);
+	reinit_completion(&i2c_dev->complete);
 
 	/* Enable I2C transaction */
 	temp = ((pmsg->len) << 20) | HI2C_EDID_READ | HI2C_ENABLE_TRANSACTION;
diff --git a/drivers/hid/hid-wiimote.h b/drivers/hid/hid-wiimote.h
index 75db0c400037..cfa63b0825b0 100644
--- a/drivers/hid/hid-wiimote.h
+++ b/drivers/hid/hid-wiimote.h
@@ -327,7 +327,7 @@ static inline void wiimote_cmd_acquire_noint(struct wiimote_data *wdata)
 static inline void wiimote_cmd_set(struct wiimote_data *wdata, int cmd,
 								__u32 opt)
 {
-	INIT_COMPLETION(wdata->state.ready);
+	reinit_completion(&wdata->state.ready);
 	wdata->state.cmd = cmd;
 	wdata->state.opt = opt;
 }
diff --git a/drivers/hwmon/jz4740-hwmon.c b/drivers/hwmon/jz4740-hwmon.c
index e0d66b9590ab..a183e488db78 100644
--- a/drivers/hwmon/jz4740-hwmon.c
+++ b/drivers/hwmon/jz4740-hwmon.c
@@ -66,7 +66,7 @@ static ssize_t jz4740_hwmon_read_adcin(struct device *dev,
 
 	mutex_lock(&hwmon->lock);
 
-	INIT_COMPLETION(*completion);
+	reinit_completion(completion);
 
 	enable_irq(hwmon->irq);
 	hwmon->cell->enable(to_platform_device(dev));
diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c
index fd059308affa..8edba9de76df 100644
--- a/drivers/i2c/busses/i2c-at91.c
+++ b/drivers/i2c/busses/i2c-at91.c
@@ -371,7 +371,7 @@ static int at91_do_twi_transfer(struct at91_twi_dev *dev)
 	dev_dbg(dev->dev, "transfer: %s %d bytes.\n",
 		(dev->msg->flags & I2C_M_RD) ? "read" : "write", dev->buf_len);
 
-	INIT_COMPLETION(dev->cmd_complete);
+	reinit_completion(&dev->cmd_complete);
 	dev->transfer_status = 0;
 
 	if (!dev->buf_len) {
diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c
index ea4b08fc3353..d7e8600f31fb 100644
--- a/drivers/i2c/busses/i2c-bcm2835.c
+++ b/drivers/i2c/busses/i2c-bcm2835.c
@@ -151,7 +151,7 @@ static int bcm2835_i2c_xfer_msg(struct bcm2835_i2c_dev *i2c_dev,
 
 	i2c_dev->msg_buf = msg->buf;
 	i2c_dev->msg_buf_remaining = msg->len;
-	INIT_COMPLETION(i2c_dev->completion);
+	reinit_completion(&i2c_dev->completion);
 
 	bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, BCM2835_I2C_C_CLEAR);
 
diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c
index 132369fad4e0..960dec61c64e 100644
--- a/drivers/i2c/busses/i2c-davinci.c
+++ b/drivers/i2c/busses/i2c-davinci.c
@@ -323,7 +323,7 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop)
 
 	davinci_i2c_write_reg(dev, DAVINCI_I2C_CNT_REG, dev->buf_len);
 
-	INIT_COMPLETION(dev->cmd_complete);
+	reinit_completion(&dev->cmd_complete);
 	dev->cmd_err = 0;
 
 	/* Take I2C out of reset and configure it as master */
diff --git a/drivers/i2c/busses/i2c-designware-core.c b/drivers/i2c/busses/i2c-designware-core.c
index 5888feef1ac5..e89e3e2145e5 100644
--- a/drivers/i2c/busses/i2c-designware-core.c
+++ b/drivers/i2c/busses/i2c-designware-core.c
@@ -613,7 +613,7 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num)
 	mutex_lock(&dev->lock);
 	pm_runtime_get_sync(dev->dev);
 
-	INIT_COMPLETION(dev->cmd_complete);
+	reinit_completion(&dev->cmd_complete);
 	dev->msgs = msgs;
 	dev->msgs_num = num;
 	dev->cmd_err = 0;
diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c
index 1672effbcebb..0043ede234c2 100644
--- a/drivers/i2c/busses/i2c-ismt.c
+++ b/drivers/i2c/busses/i2c-ismt.c
@@ -541,7 +541,7 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr,
 		desc->dptr_high = upper_32_bits(dma_addr);
 	}
 
-	INIT_COMPLETION(priv->cmp);
+	reinit_completion(&priv->cmp);
 
 	/* Add the descriptor */
 	ismt_submit_desc(priv);
diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c
index b7c857774708..3aedd86a6468 100644
--- a/drivers/i2c/busses/i2c-mxs.c
+++ b/drivers/i2c/busses/i2c-mxs.c
@@ -505,7 +505,7 @@ static int mxs_i2c_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg,
 				return err;
 		}
 	} else {
-		INIT_COMPLETION(i2c->cmd_complete);
+		reinit_completion(&i2c->cmd_complete);
 		ret = mxs_i2c_dma_setup_xfer(adap, msg, flags);
 		if (ret)
 			return ret;
diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
index 9967a6f9c2ff..a6a891d7970d 100644
--- a/drivers/i2c/busses/i2c-omap.c
+++ b/drivers/i2c/busses/i2c-omap.c
@@ -543,7 +543,7 @@ static int omap_i2c_xfer_msg(struct i2c_adapter *adap,
 	w |= OMAP_I2C_BUF_RXFIF_CLR | OMAP_I2C_BUF_TXFIF_CLR;
 	omap_i2c_write_reg(dev, OMAP_I2C_BUF_REG, w);
 
-	INIT_COMPLETION(dev->cmd_complete);
+	reinit_completion(&dev->cmd_complete);
 	dev->cmd_err = 0;
 
 	w = OMAP_I2C_CON_EN | OMAP_I2C_CON_MST | OMAP_I2C_CON_STT;
diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
index c457cb447c66..e661edee4d0c 100644
--- a/drivers/i2c/busses/i2c-tegra.c
+++ b/drivers/i2c/busses/i2c-tegra.c
@@ -544,7 +544,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev,
 	i2c_dev->msg_buf_remaining = msg->len;
 	i2c_dev->msg_err = I2C_ERR_NONE;
 	i2c_dev->msg_read = (msg->flags & I2C_M_RD);
-	INIT_COMPLETION(i2c_dev->msg_complete);
+	reinit_completion(&i2c_dev->msg_complete);
 
 	packet_header = (0 << PACKET_HEADER0_HEADER_SIZE_SHIFT) |
 			PACKET_HEADER0_PROTOCOL_I2C |
diff --git a/drivers/i2c/busses/i2c-wmt.c b/drivers/i2c/busses/i2c-wmt.c
index c65da3d913a0..31395fa8121d 100644
--- a/drivers/i2c/busses/i2c-wmt.c
+++ b/drivers/i2c/busses/i2c-wmt.c
@@ -158,7 +158,7 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg,
 		writew(val, i2c_dev->base + REG_CR);
 	}
 
-	INIT_COMPLETION(i2c_dev->complete);
+	reinit_completion(&i2c_dev->complete);
 
 	if (i2c_dev->mode == I2C_MODE_STANDARD)
 		tcr_val = TCR_STANDARD_MODE;
@@ -247,7 +247,7 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg,
 		writew(val, i2c_dev->base + REG_CR);
 	}
 
-	INIT_COMPLETION(i2c_dev->complete);
+	reinit_completion(&i2c_dev->complete);
 
 	if (i2c_dev->mode == I2C_MODE_STANDARD)
 		tcr_val = TCR_STANDARD_MODE;
diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index e6fbd3e70981..9a4e0e32a771 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -188,7 +188,7 @@ static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta,
 
 	spi_bus_lock(sigma_delta->spi->master);
 	sigma_delta->bus_locked = true;
-	INIT_COMPLETION(sigma_delta->completion);
+	reinit_completion(&sigma_delta->completion);
 
 	ret = ad_sigma_delta_set_mode(sigma_delta, mode);
 	if (ret < 0)
@@ -259,7 +259,7 @@ int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev,
 
 	spi_bus_lock(sigma_delta->spi->master);
 	sigma_delta->bus_locked = true;
-	INIT_COMPLETION(sigma_delta->completion);
+	reinit_completion(&sigma_delta->completion);
 
 	ad_sigma_delta_set_mode(sigma_delta, AD_SD_MODE_SINGLE);
 
@@ -343,7 +343,7 @@ static int ad_sd_buffer_postdisable(struct iio_dev *indio_dev)
 {
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 
-	INIT_COMPLETION(sigma_delta->completion);
+	reinit_completion(&sigma_delta->completion);
 	wait_for_completion_timeout(&sigma_delta->completion, HZ);
 
 	if (!sigma_delta->irq_dis) {
diff --git a/drivers/iio/adc/nau7802.c b/drivers/iio/adc/nau7802.c
index 54c5babe6746..e525aa6475c4 100644
--- a/drivers/iio/adc/nau7802.c
+++ b/drivers/iio/adc/nau7802.c
@@ -190,7 +190,7 @@ static int nau7802_read_irq(struct iio_dev *indio_dev,
 	struct nau7802_state *st = iio_priv(indio_dev);
 	int ret;
 
-	INIT_COMPLETION(st->value_ok);
+	reinit_completion(&st->value_ok);
 	enable_irq(st->client->irq);
 
 	nau7802_sync(st);
diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c
index d53e0b72a407..4204841cdc49 100644
--- a/drivers/input/touchscreen/cyttsp_core.c
+++ b/drivers/input/touchscreen/cyttsp_core.c
@@ -242,7 +242,7 @@ static int cyttsp_soft_reset(struct cyttsp *ts)
 	int retval;
 
 	/* wait for interrupt to set ready completion */
-	INIT_COMPLETION(ts->bl_ready);
+	reinit_completion(&ts->bl_ready);
 	ts->state = CY_BL_STATE;
 
 	enable_irq(ts->irq);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 50ea7ed24dce..81b0fa660452 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -950,7 +950,7 @@ static int crypt_convert(struct crypt_config *cc,
 		/* async */
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
-			INIT_COMPLETION(ctx->restart);
+			reinit_completion(&ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
 			this_cc->req = NULL;
diff --git a/drivers/media/platform/blackfin/bfin_capture.c b/drivers/media/platform/blackfin/bfin_capture.c
index 4c1105977090..281916591437 100644
--- a/drivers/media/platform/blackfin/bfin_capture.c
+++ b/drivers/media/platform/blackfin/bfin_capture.c
@@ -422,7 +422,7 @@ static int bcap_start_streaming(struct vb2_queue *vq, unsigned int count)
 		return ret;
 	}
 
-	INIT_COMPLETION(bcap_dev->comp);
+	reinit_completion(&bcap_dev->comp);
 	bcap_dev->stop = false;
 	return 0;
 }
diff --git a/drivers/media/radio/radio-wl1273.c b/drivers/media/radio/radio-wl1273.c
index 97c2c18803ef..9cf6731fb816 100644
--- a/drivers/media/radio/radio-wl1273.c
+++ b/drivers/media/radio/radio-wl1273.c
@@ -375,7 +375,7 @@ static int wl1273_fm_set_tx_freq(struct wl1273_device *radio, unsigned int freq)
 	if (r)
 		return r;
 
-	INIT_COMPLETION(radio->busy);
+	reinit_completion(&radio->busy);
 
 	/* wait for the FR IRQ */
 	r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(2000));
@@ -389,7 +389,7 @@ static int wl1273_fm_set_tx_freq(struct wl1273_device *radio, unsigned int freq)
 	if (r)
 		return r;
 
-	INIT_COMPLETION(radio->busy);
+	reinit_completion(&radio->busy);
 
 	/* wait for the POWER_ENB IRQ */
 	r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(1000));
@@ -444,7 +444,7 @@ static int wl1273_fm_set_rx_freq(struct wl1273_device *radio, unsigned int freq)
 		goto err;
 	}
 
-	INIT_COMPLETION(radio->busy);
+	reinit_completion(&radio->busy);
 
 	r = wait_for_completion_timeout(&radio->busy, msecs_to_jiffies(2000));
 	if (!r) {
@@ -805,7 +805,7 @@ static int wl1273_fm_set_seek(struct wl1273_device *radio,
 	if (level < SCHAR_MIN || level > SCHAR_MAX)
 		return -EINVAL;
 
-	INIT_COMPLETION(radio->busy);
+	reinit_completion(&radio->busy);
 	dev_dbg(radio->dev, "%s: BUSY\n", __func__);
 
 	r = core->write(core, WL1273_INT_MASK_SET, radio->irq_flags);
@@ -847,7 +847,7 @@ static int wl1273_fm_set_seek(struct wl1273_device *radio,
 	if (r)
 		goto out;
 
-	INIT_COMPLETION(radio->busy);
+	reinit_completion(&radio->busy);
 	dev_dbg(radio->dev, "%s: BUSY\n", __func__);
 
 	r = core->write(core, WL1273_TUNER_MODE_SET, TUNER_MODE_AUTO_SEEK);
diff --git a/drivers/media/radio/si470x/radio-si470x-common.c b/drivers/media/radio/si470x/radio-si470x-common.c
index 5c57e5b0f949..0bd250068285 100644
--- a/drivers/media/radio/si470x/radio-si470x-common.c
+++ b/drivers/media/radio/si470x/radio-si470x-common.c
@@ -218,7 +218,7 @@ static int si470x_set_chan(struct si470x_device *radio, unsigned short chan)
 		goto done;
 
 	/* wait till tune operation has completed */
-	INIT_COMPLETION(radio->completion);
+	reinit_completion(&radio->completion);
 	retval = wait_for_completion_timeout(&radio->completion,
 			msecs_to_jiffies(tune_timeout));
 	if (!retval)
@@ -341,7 +341,7 @@ static int si470x_set_seek(struct si470x_device *radio,
 		return retval;
 
 	/* wait till tune operation has completed */
-	INIT_COMPLETION(radio->completion);
+	reinit_completion(&radio->completion);
 	retval = wait_for_completion_timeout(&radio->completion,
 			msecs_to_jiffies(seek_timeout));
 	if (!retval)
diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c
index 19632b1c2190..b53626ba6f49 100644
--- a/drivers/media/rc/iguanair.c
+++ b/drivers/media/rc/iguanair.c
@@ -207,7 +207,7 @@ static int iguanair_send(struct iguanair *ir, unsigned size)
 {
 	int rc;
 
-	INIT_COMPLETION(ir->completion);
+	reinit_completion(&ir->completion);
 
 	ir->urb_out->transfer_buffer_length = size;
 	rc = usb_submit_urb(ir->urb_out, GFP_KERNEL);
diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index bbf4aea1627d..a0547dbf9806 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -253,7 +253,7 @@ void memstick_new_req(struct memstick_host *host)
 {
 	if (host->card) {
 		host->retries = cmd_retries;
-		INIT_COMPLETION(host->card->mrq_complete);
+		reinit_completion(&host->card->mrq_complete);
 		host->request(host);
 	}
 }
diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c
index 1b6e91345222..31727bf285d0 100644
--- a/drivers/memstick/host/r592.c
+++ b/drivers/memstick/host/r592.c
@@ -290,7 +290,7 @@ static int r592_transfer_fifo_dma(struct r592_device *dev)
 	dbg_verbose("doing dma transfer");
 
 	dev->dma_error = 0;
-	INIT_COMPLETION(dev->dma_done);
+	reinit_completion(&dev->dma_done);
 
 	/* TODO: hidden assumption about nenth beeing always 1 */
 	sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ?
diff --git a/drivers/misc/mic/card/mic_virtio.c b/drivers/misc/mic/card/mic_virtio.c
index 914cc9b2caad..8aa42e738acc 100644
--- a/drivers/misc/mic/card/mic_virtio.c
+++ b/drivers/misc/mic/card/mic_virtio.c
@@ -493,7 +493,7 @@ static int mic_remove_device(struct mic_device_desc __iomem *d,
 			ioread8(&dc->config_change), ioread8(&d->type), mvdev);
 
 		status = ioread8(&d->status);
-		INIT_COMPLETION(mvdev->reset_done);
+		reinit_completion(&mvdev->reset_done);
 		unregister_virtio_device(&mvdev->vdev);
 		mic_free_card_irq(mvdev->virtio_cookie, mvdev);
 		if (status & VIRTIO_CONFIG_S_DRIVER_OK)
diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c
index b079c65eed6d..7558d9186438 100644
--- a/drivers/misc/mic/host/mic_boot.c
+++ b/drivers/misc/mic/host/mic_boot.c
@@ -38,7 +38,7 @@ static void mic_reset(struct mic_device *mdev)
 
 #define MIC_RESET_TO (45)
 
-	INIT_COMPLETION(mdev->reset_wait);
+	reinit_completion(&mdev->reset_wait);
 	mdev->ops->reset_fw_ready(mdev);
 	mdev->ops->reset(mdev);
 
diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c
index 83907c720594..96853a09788a 100644
--- a/drivers/misc/ti-st/st_kim.c
+++ b/drivers/misc/ti-st/st_kim.c
@@ -218,7 +218,7 @@ static long read_local_version(struct kim_data_s *kim_gdata, char *bts_scr_name)
 
 	pr_debug("%s", __func__);
 
-	INIT_COMPLETION(kim_gdata->kim_rcvd);
+	reinit_completion(&kim_gdata->kim_rcvd);
 	if (4 != st_int_write(kim_gdata->core_data, read_ver_cmd, 4)) {
 		pr_err("kim: couldn't write 4 bytes");
 		return -EIO;
@@ -229,7 +229,7 @@ static long read_local_version(struct kim_data_s *kim_gdata, char *bts_scr_name)
 		pr_err(" waiting for ver info- timed out ");
 		return -ETIMEDOUT;
 	}
-	INIT_COMPLETION(kim_gdata->kim_rcvd);
+	reinit_completion(&kim_gdata->kim_rcvd);
 	/* the positions 12 & 13 in the response buffer provide with the
 	 * chip, major & minor numbers
 	 */
@@ -362,7 +362,7 @@ static long download_firmware(struct kim_data_s *kim_gdata)
 			/* reinit completion before sending for the
 			 * relevant wait
 			 */
-			INIT_COMPLETION(kim_gdata->kim_rcvd);
+			reinit_completion(&kim_gdata->kim_rcvd);
 
 			/*
 			 * Free space found in uart buffer, call st_int_write
@@ -398,7 +398,7 @@ static long download_firmware(struct kim_data_s *kim_gdata)
 				release_firmware(kim_gdata->fw_entry);
 				return -ETIMEDOUT;
 			}
-			INIT_COMPLETION(kim_gdata->kim_rcvd);
+			reinit_completion(&kim_gdata->kim_rcvd);
 			break;
 		case ACTION_DELAY:	/* sleep */
 			pr_info("sleep command in scr");
@@ -474,7 +474,7 @@ long st_kim_start(void *kim_data)
 		gpio_set_value(kim_gdata->nshutdown, GPIO_HIGH);
 		mdelay(100);
 		/* re-initialize the completion */
-		INIT_COMPLETION(kim_gdata->ldisc_installed);
+		reinit_completion(&kim_gdata->ldisc_installed);
 		/* send notification to UIM */
 		kim_gdata->ldisc_install = 1;
 		pr_info("ldisc_install = 1");
@@ -525,7 +525,7 @@ long st_kim_stop(void *kim_data)
 		kim_gdata->kim_pdev->dev.platform_data;
 	struct tty_struct	*tty = kim_gdata->core_data->tty;
 
-	INIT_COMPLETION(kim_gdata->ldisc_installed);
+	reinit_completion(&kim_gdata->ldisc_installed);
 
 	if (tty) {	/* can be called before ldisc is installed */
 		/* Flush any pending characters in the driver and discipline. */
diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c
index 4edea7f4462f..9dfdb06c508b 100644
--- a/drivers/mtd/nand/mxc_nand.c
+++ b/drivers/mtd/nand/mxc_nand.c
@@ -396,7 +396,7 @@ static void wait_op_done(struct mxc_nand_host *host, int useirq)
 
 	if (useirq) {
 		if (!host->devtype_data->check_int(host)) {
-			INIT_COMPLETION(host->op_completion);
+			reinit_completion(&host->op_completion);
 			irq_control(host, 1);
 			wait_for_completion(&host->op_completion);
 		}
diff --git a/drivers/mtd/nand/r852.c b/drivers/mtd/nand/r852.c
index 9dcf02d22aa8..325930db3f04 100644
--- a/drivers/mtd/nand/r852.c
+++ b/drivers/mtd/nand/r852.c
@@ -181,7 +181,7 @@ static void r852_do_dma(struct r852_device *dev, uint8_t *buf, int do_read)
 	/* Set dma direction */
 	dev->dma_dir = do_read;
 	dev->dma_stage = 1;
-	INIT_COMPLETION(dev->dma_done);
+	reinit_completion(&dev->dma_done);
 
 	dbg_verbose("doing dma %s ", do_read ? "read" : "write");
 
diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c
index 2362909d20c0..6547c84afc3a 100644
--- a/drivers/mtd/onenand/omap2.c
+++ b/drivers/mtd/onenand/omap2.c
@@ -159,7 +159,7 @@ static int omap2_onenand_wait(struct mtd_info *mtd, int state)
 				syscfg = read_reg(c, ONENAND_REG_SYS_CFG1);
 		}
 
-		INIT_COMPLETION(c->irq_done);
+		reinit_completion(&c->irq_done);
 		if (c->gpio_irq) {
 			result = gpio_get_value(c->gpio_irq);
 			if (result == -1) {
@@ -349,7 +349,7 @@ static int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area,
 	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
 				 dma_dst, 0, 0);
 
-	INIT_COMPLETION(c->dma_done);
+	reinit_completion(&c->dma_done);
 	omap_start_dma(c->dma_channel);
 
 	timeout = jiffies + msecs_to_jiffies(20);
@@ -420,7 +420,7 @@ static int omap3_onenand_write_bufferram(struct mtd_info *mtd, int area,
 	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
 				 dma_dst, 0, 0);
 
-	INIT_COMPLETION(c->dma_done);
+	reinit_completion(&c->dma_done);
 	omap_start_dma(c->dma_channel);
 
 	timeout = jiffies + msecs_to_jiffies(20);
@@ -499,7 +499,7 @@ static int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area,
 	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
 				 dma_dst, 0, 0);
 
-	INIT_COMPLETION(c->dma_done);
+	reinit_completion(&c->dma_done);
 	omap_start_dma(c->dma_channel);
 	wait_for_completion(&c->dma_done);
 
@@ -544,7 +544,7 @@ static int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area,
 	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
 				 dma_dst, 0, 0);
 
-	INIT_COMPLETION(c->dma_done);
+	reinit_completion(&c->dma_done);
 	omap_start_dma(c->dma_channel);
 	wait_for_completion(&c->dma_done);
 
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
index 09810ddd11ec..a01a6a74ee3a 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
@@ -3537,7 +3537,7 @@ int qlcnic_83xx_resume(struct qlcnic_adapter *adapter)
 
 void qlcnic_83xx_reinit_mbx_work(struct qlcnic_mailbox *mbx)
 {
-	INIT_COMPLETION(mbx->completion);
+	reinit_completion(&mbx->completion);
 	set_bit(QLC_83XX_MBX_READY, &mbx->status);
 }
 
diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 6f10b4964726..2cbe1c249996 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -561,7 +561,7 @@ at86rf230_xmit(struct ieee802154_dev *dev, struct sk_buff *skb)
 
 	spin_lock_irqsave(&lp->lock, flags);
 	lp->is_tx = 1;
-	INIT_COMPLETION(lp->tx_complete);
+	reinit_completion(&lp->tx_complete);
 	spin_unlock_irqrestore(&lp->lock, flags);
 
 	rc = at86rf230_write_fbuf(lp, skb->data, skb->len);
diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index 0632d34905c7..c6e46d6e9f75 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -343,7 +343,7 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 	if (ret)
 		goto err;
 
-	INIT_COMPLETION(devrec->tx_complete);
+	reinit_completion(&devrec->tx_complete);
 
 	/* Set TXNTRIG bit of TXNCON to send packet */
 	ret = read_short_reg(devrec, REG_TXNCON, &val);
diff --git a/drivers/net/wireless/ath/ath10k/htc.c b/drivers/net/wireless/ath/ath10k/htc.c
index 3118d7506734..edae50b52806 100644
--- a/drivers/net/wireless/ath/ath10k/htc.c
+++ b/drivers/net/wireless/ath/ath10k/htc.c
@@ -534,7 +534,7 @@ int ath10k_htc_wait_target(struct ath10k_htc *htc)
 	u16 credit_count;
 	u16 credit_size;
 
-	INIT_COMPLETION(htc->ctl_resp);
+	reinit_completion(&htc->ctl_resp);
 
 	status = ath10k_hif_start(htc->ar);
 	if (status) {
@@ -669,7 +669,7 @@ int ath10k_htc_connect_service(struct ath10k_htc *htc,
 	req_msg->flags = __cpu_to_le16(flags);
 	req_msg->service_id = __cpu_to_le16(conn_req->service_id);
 
-	INIT_COMPLETION(htc->ctl_resp);
+	reinit_completion(&htc->ctl_resp);
 
 	status = ath10k_htc_send(htc, ATH10K_HTC_EP_0, skb);
 	if (status) {
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 0b1cc516e778..97ac8c87cba2 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -92,7 +92,7 @@ static int ath10k_install_key(struct ath10k_vif *arvif,
 
 	lockdep_assert_held(&ar->conf_mutex);
 
-	INIT_COMPLETION(ar->install_key_done);
+	reinit_completion(&ar->install_key_done);
 
 	ret = ath10k_send_key(arvif, key, cmd, macaddr);
 	if (ret)
@@ -438,7 +438,7 @@ static int ath10k_vdev_start(struct ath10k_vif *arvif)
 
 	lockdep_assert_held(&ar->conf_mutex);
 
-	INIT_COMPLETION(ar->vdev_setup_done);
+	reinit_completion(&ar->vdev_setup_done);
 
 	arg.vdev_id = arvif->vdev_id;
 	arg.dtim_period = arvif->dtim_period;
@@ -491,7 +491,7 @@ static int ath10k_vdev_stop(struct ath10k_vif *arvif)
 
 	lockdep_assert_held(&ar->conf_mutex);
 
-	INIT_COMPLETION(ar->vdev_setup_done);
+	reinit_completion(&ar->vdev_setup_done);
 
 	ret = ath10k_wmi_vdev_stop(ar, arvif->vdev_id);
 	if (ret) {
@@ -1666,7 +1666,7 @@ void ath10k_offchan_tx_work(struct work_struct *work)
 		}
 
 		spin_lock_bh(&ar->data_lock);
-		INIT_COMPLETION(ar->offchan_tx_completed);
+		reinit_completion(&ar->offchan_tx_completed);
 		ar->offchan_tx_skb = skb;
 		spin_unlock_bh(&ar->data_lock);
 
@@ -2476,8 +2476,8 @@ static int ath10k_hw_scan(struct ieee80211_hw *hw,
 		goto exit;
 	}
 
-	INIT_COMPLETION(ar->scan.started);
-	INIT_COMPLETION(ar->scan.completed);
+	reinit_completion(&ar->scan.started);
+	reinit_completion(&ar->scan.completed);
 	ar->scan.in_progress = true;
 	ar->scan.aborting = false;
 	ar->scan.is_roc = false;
@@ -2832,9 +2832,9 @@ static int ath10k_remain_on_channel(struct ieee80211_hw *hw,
 		goto exit;
 	}
 
-	INIT_COMPLETION(ar->scan.started);
-	INIT_COMPLETION(ar->scan.completed);
-	INIT_COMPLETION(ar->scan.on_channel);
+	reinit_completion(&ar->scan.started);
+	reinit_completion(&ar->scan.completed);
+	reinit_completion(&ar->scan.on_channel);
 	ar->scan.in_progress = true;
 	ar->scan.aborting = false;
 	ar->scan.is_roc = true;
diff --git a/drivers/net/wireless/ath/carl9170/usb.c b/drivers/net/wireless/ath/carl9170/usb.c
index 307bc0ddff99..ca115f33746f 100644
--- a/drivers/net/wireless/ath/carl9170/usb.c
+++ b/drivers/net/wireless/ath/carl9170/usb.c
@@ -773,7 +773,7 @@ void carl9170_usb_stop(struct ar9170 *ar)
 	complete_all(&ar->cmd_wait);
 
 	/* This is required to prevent an early completion on _start */
-	INIT_COMPLETION(ar->cmd_wait);
+	reinit_completion(&ar->cmd_wait);
 
 	/*
 	 * Note:
diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c
index 0a2844c48a60..fd30cddd5882 100644
--- a/drivers/net/wireless/ath/wil6210/main.c
+++ b/drivers/net/wireless/ath/wil6210/main.c
@@ -250,7 +250,7 @@ int wil_reset(struct wil6210_priv *wil)
 
 	/* init after reset */
 	wil->pending_connect_cid = -1;
-	INIT_COMPLETION(wil->wmi_ready);
+	reinit_completion(&wil->wmi_ready);
 
 	/* TODO: release MAC reset */
 	wil6210_enable_irq(wil);
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
index d7a974532909..5b5b952d47b1 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
@@ -1148,7 +1148,7 @@ static s32 brcmf_p2p_af_searching_channel(struct brcmf_p2p_info *p2p)
 
 	pri_vif = p2p->bss_idx[P2PAPI_BSSCFG_PRIMARY].vif;
 
-	INIT_COMPLETION(afx_hdl->act_frm_scan);
+	reinit_completion(&afx_hdl->act_frm_scan);
 	set_bit(BRCMF_P2P_STATUS_FINDING_COMMON_CHANNEL, &p2p->status);
 	afx_hdl->is_active = true;
 	afx_hdl->peer_chan = P2P_INVALID_CHANNEL;
@@ -1501,7 +1501,7 @@ static s32 brcmf_p2p_tx_action_frame(struct brcmf_p2p_info *p2p,
 
 	brcmf_dbg(TRACE, "Enter\n");
 
-	INIT_COMPLETION(p2p->send_af_done);
+	reinit_completion(&p2p->send_af_done);
 	clear_bit(BRCMF_P2P_STATUS_ACTION_TX_COMPLETED, &p2p->status);
 	clear_bit(BRCMF_P2P_STATUS_ACTION_TX_NOACK, &p2p->status);
 
diff --git a/drivers/net/wireless/zd1211rw/zd_usb.c b/drivers/net/wireless/zd1211rw/zd_usb.c
index 7ef0b4a181e1..84d94f572a46 100644
--- a/drivers/net/wireless/zd1211rw/zd_usb.c
+++ b/drivers/net/wireless/zd1211rw/zd_usb.c
@@ -1619,7 +1619,7 @@ static void prepare_read_regs_int(struct zd_usb *usb,
 	atomic_set(&intr->read_regs_enabled, 1);
 	intr->read_regs.req = req;
 	intr->read_regs.req_count = count;
-	INIT_COMPLETION(intr->read_regs.completion);
+	reinit_completion(&intr->read_regs.completion);
 	spin_unlock_irq(&intr->lock);
 }
 
diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c
index d4716273651e..c864f82bd37d 100644
--- a/drivers/parport/parport_ip32.c
+++ b/drivers/parport/parport_ip32.c
@@ -1331,7 +1331,7 @@ static unsigned int parport_ip32_fwp_wait_interrupt(struct parport *p)
 			break;
 
 		/* Initialize mutex used to take interrupts into account */
-		INIT_COMPLETION(priv->irq_complete);
+		reinit_completion(&priv->irq_complete);
 
 		/* Enable serviceIntr */
 		parport_ip32_frob_econtrol(p, ECR_SERVINTR, 0);
@@ -1446,7 +1446,7 @@ static size_t parport_ip32_fifo_write_block_dma(struct parport *p,
 	priv->irq_mode = PARPORT_IP32_IRQ_HERE;
 
 	parport_ip32_dma_start(DMA_TO_DEVICE, (void *)buf, len);
-	INIT_COMPLETION(priv->irq_complete);
+	reinit_completion(&priv->irq_complete);
 	parport_ip32_frob_econtrol(p, ECR_DMAEN | ECR_SERVINTR, ECR_DMAEN);
 
 	nfault_timeout = min((unsigned long)physport->cad->timeout,
diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c
index 8eea2efbbb6d..605a9be55129 100644
--- a/drivers/platform/x86/apple-gmux.c
+++ b/drivers/platform/x86/apple-gmux.c
@@ -289,7 +289,7 @@ static int gmux_switchto(enum vga_switcheroo_client_id id)
 static int gmux_set_discrete_state(struct apple_gmux_data *gmux_data,
 				   enum vga_switcheroo_state state)
 {
-	INIT_COMPLETION(gmux_data->powerchange_done);
+	reinit_completion(&gmux_data->powerchange_done);
 
 	if (state == VGA_SWITCHEROO_ON) {
 		gmux_write8(gmux_data, GMUX_PORT_DISCRETE_POWER, 1);
diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c
index 754970717c31..3cb4178e397c 100644
--- a/drivers/power/ab8500_fg.c
+++ b/drivers/power/ab8500_fg.c
@@ -574,8 +574,8 @@ int ab8500_fg_inst_curr_start(struct ab8500_fg *di)
 	}
 
 	/* Return and WFI */
-	INIT_COMPLETION(di->ab8500_fg_started);
-	INIT_COMPLETION(di->ab8500_fg_complete);
+	reinit_completion(&di->ab8500_fg_started);
+	reinit_completion(&di->ab8500_fg_complete);
 	enable_irq(di->irq);
 
 	/* Note: cc_lock is still locked */
diff --git a/drivers/power/jz4740-battery.c b/drivers/power/jz4740-battery.c
index d9686aa9270a..6c8931d4ad62 100644
--- a/drivers/power/jz4740-battery.c
+++ b/drivers/power/jz4740-battery.c
@@ -73,7 +73,7 @@ static long jz_battery_read_voltage(struct jz_battery *battery)
 
 	mutex_lock(&battery->lock);
 
-	INIT_COMPLETION(battery->read_completion);
+	reinit_completion(&battery->read_completion);
 
 	enable_irq(battery->irq);
 	battery->cell->enable(battery->pdev);
diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c
index a34e5cfd2ab5..965a9da70867 100644
--- a/drivers/rtc/rtc-hid-sensor-time.c
+++ b/drivers/rtc/rtc-hid-sensor-time.c
@@ -209,7 +209,7 @@ static int hid_rtc_read_time(struct device *dev, struct rtc_time *tm)
 		platform_get_drvdata(to_platform_device(dev));
 	int ret;
 
-	INIT_COMPLETION(time_state->comp_last_time);
+	reinit_completion(&time_state->comp_last_time);
 	/* get a report with all values through requesting one value */
 	sensor_hub_input_attr_get_raw_value(time_state->common_attributes.hsdev,
 			HID_USAGE_SENSOR_TIME, hid_time_addresses[0],
diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index 4c332143a310..3ed666fe840a 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -217,7 +217,7 @@ static int bcm2835_spi_start_transfer(struct spi_device *spi,
 		cs |= spi->chip_select;
 	}
 
-	INIT_COMPLETION(bs->done);
+	reinit_completion(&bs->done);
 	bs->tx_buf = tfr->tx_buf;
 	bs->rx_buf = tfr->rx_buf;
 	bs->len = tfr->len;
diff --git a/drivers/spi/spi-clps711x.c b/drivers/spi/spi-clps711x.c
index e2a5a426b2ef..6f03d7e6435d 100644
--- a/drivers/spi/spi-clps711x.c
+++ b/drivers/spi/spi-clps711x.c
@@ -105,7 +105,7 @@ static int spi_clps711x_transfer_one_message(struct spi_master *master,
 
 		gpio_set_value(cs, !!(msg->spi->mode & SPI_CS_HIGH));
 
-		INIT_COMPLETION(hw->done);
+		reinit_completion(&hw->done);
 
 		hw->count = 0;
 		hw->len = xfer->len;
diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c
index dd72445ba2ea..50b2d88c8190 100644
--- a/drivers/spi/spi-davinci.c
+++ b/drivers/spi/spi-davinci.c
@@ -554,7 +554,7 @@ static int davinci_spi_bufs(struct spi_device *spi, struct spi_transfer *t)
 	clear_io_bits(dspi->base + SPIGCR1, SPIGCR1_POWERDOWN_MASK);
 	set_io_bits(dspi->base + SPIGCR1, SPIGCR1_SPIENA_MASK);
 
-	INIT_COMPLETION(dspi->done);
+	reinit_completion(&dspi->done);
 
 	if (spicfg->io_type == SPI_IO_TYPE_INTR)
 		set_io_bits(dspi->base + SPIINT, SPIINT_MASKINT);
diff --git a/drivers/spi/spi-fsl-espi.c b/drivers/spi/spi-fsl-espi.c
index 32200d4f8780..80d8f40f7e05 100644
--- a/drivers/spi/spi-fsl-espi.c
+++ b/drivers/spi/spi-fsl-espi.c
@@ -232,7 +232,7 @@ static int fsl_espi_bufs(struct spi_device *spi, struct spi_transfer *t)
 	mpc8xxx_spi->tx = t->tx_buf;
 	mpc8xxx_spi->rx = t->rx_buf;
 
-	INIT_COMPLETION(mpc8xxx_spi->done);
+	reinit_completion(&mpc8xxx_spi->done);
 
 	/* Set SPCOM[CS] and SPCOM[TRANLEN] field */
 	if ((t->len - 1) > SPCOM_TRANLEN_MAX) {
diff --git a/drivers/spi/spi-fsl-spi.c b/drivers/spi/spi-fsl-spi.c
index 2129fcd1c31b..119f7af94537 100644
--- a/drivers/spi/spi-fsl-spi.c
+++ b/drivers/spi/spi-fsl-spi.c
@@ -339,7 +339,7 @@ static int fsl_spi_bufs(struct spi_device *spi, struct spi_transfer *t,
 	mpc8xxx_spi->tx = t->tx_buf;
 	mpc8xxx_spi->rx = t->rx_buf;
 
-	INIT_COMPLETION(mpc8xxx_spi->done);
+	reinit_completion(&mpc8xxx_spi->done);
 
 	if (mpc8xxx_spi->flags & SPI_CPM_MODE)
 		ret = fsl_spi_cpm_bufs(mpc8xxx_spi, t, is_dma_mapped);
diff --git a/drivers/spi/spi-mpc512x-psc.c b/drivers/spi/spi-mpc512x-psc.c
index 58d5ee0e4443..9602bbd8d7ea 100644
--- a/drivers/spi/spi-mpc512x-psc.c
+++ b/drivers/spi/spi-mpc512x-psc.c
@@ -167,7 +167,7 @@ static int mpc512x_psc_spi_transfer_rxtx(struct spi_device *spi,
 			}
 
 			/* have the ISR trigger when the TX FIFO is empty */
-			INIT_COMPLETION(mps->txisrdone);
+			reinit_completion(&mps->txisrdone);
 			out_be32(&fifo->txisr, MPC512x_PSC_FIFO_EMPTY);
 			out_be32(&fifo->tximr, MPC512x_PSC_FIFO_EMPTY);
 			wait_for_completion(&mps->txisrdone);
diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c
index de333059a9a7..73afb56c08cc 100644
--- a/drivers/spi/spi-mxs.c
+++ b/drivers/spi/spi-mxs.c
@@ -202,7 +202,7 @@ static int mxs_spi_txrx_dma(struct mxs_spi *spi,
 	if (!dma_xfer)
 		return -ENOMEM;
 
-	INIT_COMPLETION(spi->c);
+	reinit_completion(&spi->c);
 
 	/* Chip select was already programmed into CTRL0 */
 	ctrl0 = readl(ssp->base + HW_SSP_CTRL0);
diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c
index 9e2020df9e0f..4c4b0a1219a7 100644
--- a/drivers/spi/spi-s3c64xx.c
+++ b/drivers/spi/spi-s3c64xx.c
@@ -890,7 +890,7 @@ static int s3c64xx_spi_transfer_one(struct spi_master *master,
 	unsigned long flags;
 	int use_dma;
 
-	INIT_COMPLETION(sdd->xfer_completion);
+		reinit_completion(&sdd->xfer_completion);
 
 	/* Only BPW and Speed may change across transfers */
 	bpw = xfer->bits_per_word;
diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c
index 2a95435a6a11..c74298cf70e2 100644
--- a/drivers/spi/spi-sh-msiof.c
+++ b/drivers/spi/spi-sh-msiof.c
@@ -465,7 +465,7 @@ static int sh_msiof_spi_txrx_once(struct sh_msiof_spi_priv *p,
 	ret = ret ? ret : sh_msiof_modify_ctr_wait(p, 0, CTR_TXE);
 
 	/* start by setting frame bit */
-	INIT_COMPLETION(p->done);
+	reinit_completion(&p->done);
 	ret = ret ? ret : sh_msiof_modify_ctr_wait(p, 0, CTR_TFSE);
 	if (ret) {
 		dev_err(&p->pdev->dev, "failed to start hardware\n");
diff --git a/drivers/spi/spi-sirf.c b/drivers/spi/spi-sirf.c
index 592b4aff651f..ed5e501c4652 100644
--- a/drivers/spi/spi-sirf.c
+++ b/drivers/spi/spi-sirf.c
@@ -305,8 +305,8 @@ static int spi_sirfsoc_transfer(struct spi_device *spi, struct spi_transfer *t)
 	sspi->tx = t->tx_buf ? t->tx_buf : sspi->dummypage;
 	sspi->rx = t->rx_buf ? t->rx_buf : sspi->dummypage;
 	sspi->left_tx_word = sspi->left_rx_word = t->len / sspi->word_width;
-	INIT_COMPLETION(sspi->rx_done);
-	INIT_COMPLETION(sspi->tx_done);
+	reinit_completion(&sspi->rx_done);
+	reinit_completion(&sspi->tx_done);
 
 	writel(SIRFSOC_SPI_INT_MASK_ALL, sspi->base + SIRFSOC_SPI_INT_STATUS);
 
diff --git a/drivers/spi/spi-tegra114.c b/drivers/spi/spi-tegra114.c
index 9146bb3c2489..aaecfb3ebf58 100644
--- a/drivers/spi/spi-tegra114.c
+++ b/drivers/spi/spi-tegra114.c
@@ -451,7 +451,7 @@ static void tegra_spi_dma_complete(void *args)
 
 static int tegra_spi_start_tx_dma(struct tegra_spi_data *tspi, int len)
 {
-	INIT_COMPLETION(tspi->tx_dma_complete);
+	reinit_completion(&tspi->tx_dma_complete);
 	tspi->tx_dma_desc = dmaengine_prep_slave_single(tspi->tx_dma_chan,
 				tspi->tx_dma_phys, len, DMA_MEM_TO_DEV,
 				DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -470,7 +470,7 @@ static int tegra_spi_start_tx_dma(struct tegra_spi_data *tspi, int len)
 
 static int tegra_spi_start_rx_dma(struct tegra_spi_data *tspi, int len)
 {
-	INIT_COMPLETION(tspi->rx_dma_complete);
+	reinit_completion(&tspi->rx_dma_complete);
 	tspi->rx_dma_desc = dmaengine_prep_slave_single(tspi->rx_dma_chan,
 				tspi->rx_dma_phys, len, DMA_DEV_TO_MEM,
 				DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -844,7 +844,7 @@ static int tegra_spi_transfer_one_message(struct spi_master *master,
 	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
 		unsigned long cmd1;
 
-		INIT_COMPLETION(tspi->xfer_completion);
+		reinit_completion(&tspi->xfer_completion);
 
 		cmd1 = tegra_spi_setup_transfer_one(spi, xfer, is_first_msg);
 
diff --git a/drivers/spi/spi-tegra20-sflash.c b/drivers/spi/spi-tegra20-sflash.c
index 79be8ce6a9d1..4dc8e8129459 100644
--- a/drivers/spi/spi-tegra20-sflash.c
+++ b/drivers/spi/spi-tegra20-sflash.c
@@ -339,7 +339,7 @@ static int tegra_sflash_transfer_one_message(struct spi_master *master,
 	msg->actual_length = 0;
 	single_xfer = list_is_singular(&msg->transfers);
 	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
-		INIT_COMPLETION(tsd->xfer_completion);
+		reinit_completion(&tsd->xfer_completion);
 		ret = tegra_sflash_start_transfer_one(spi, xfer,
 					is_first_msg, single_xfer);
 		if (ret < 0) {
diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c
index af0a67886ae8..e66715ba37ed 100644
--- a/drivers/spi/spi-tegra20-slink.c
+++ b/drivers/spi/spi-tegra20-slink.c
@@ -462,7 +462,7 @@ static void tegra_slink_dma_complete(void *args)
 
 static int tegra_slink_start_tx_dma(struct tegra_slink_data *tspi, int len)
 {
-	INIT_COMPLETION(tspi->tx_dma_complete);
+	reinit_completion(&tspi->tx_dma_complete);
 	tspi->tx_dma_desc = dmaengine_prep_slave_single(tspi->tx_dma_chan,
 				tspi->tx_dma_phys, len, DMA_MEM_TO_DEV,
 				DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -481,7 +481,7 @@ static int tegra_slink_start_tx_dma(struct tegra_slink_data *tspi, int len)
 
 static int tegra_slink_start_rx_dma(struct tegra_slink_data *tspi, int len)
 {
-	INIT_COMPLETION(tspi->rx_dma_complete);
+	reinit_completion(&tspi->rx_dma_complete);
 	tspi->rx_dma_desc = dmaengine_prep_slave_single(tspi->rx_dma_chan,
 				tspi->rx_dma_phys, len, DMA_DEV_TO_MEM,
 				DMA_PREP_INTERRUPT |  DMA_CTRL_ACK);
@@ -836,7 +836,7 @@ static int tegra_slink_transfer_one(struct spi_master *master,
 	struct tegra_slink_data *tspi = spi_master_get_devdata(master);
 	int ret;
 
-	INIT_COMPLETION(tspi->xfer_completion);
+	reinit_completion(&tspi->xfer_completion);
 	ret = tegra_slink_start_transfer_one(spi, xfer);
 	if (ret < 0) {
 		dev_err(tspi->dev,
diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c
index ec3a83f52ea2..6d4ce4615163 100644
--- a/drivers/spi/spi-xilinx.c
+++ b/drivers/spi/spi-xilinx.c
@@ -258,7 +258,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
 	xspi->tx_ptr = t->tx_buf;
 	xspi->rx_ptr = t->rx_buf;
 	xspi->remaining_bytes = t->len;
-	INIT_COMPLETION(xspi->done);
+	reinit_completion(&xspi->done);
 
 
 	/* Enable the transmit empty interrupt, which we use to determine
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 927998aa5e71..8d85ddc46011 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -571,7 +571,7 @@ static int spi_transfer_one_message(struct spi_master *master,
 	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
 		trace_spi_transfer_start(msg, xfer);
 
-		INIT_COMPLETION(master->xfer_completion);
+		reinit_completion(&master->xfer_completion);
 
 		ret = master->transfer_one(master, msg->spi, xfer);
 		if (ret < 0) {
diff --git a/drivers/staging/iio/adc/mxs-lradc.c b/drivers/staging/iio/adc/mxs-lradc.c
index aeae76b77be5..e2dd7830b320 100644
--- a/drivers/staging/iio/adc/mxs-lradc.c
+++ b/drivers/staging/iio/adc/mxs-lradc.c
@@ -783,7 +783,7 @@ static int mxs_lradc_read_raw(struct iio_dev *iio_dev,
 	if (!ret)
 		return -EBUSY;
 
-	INIT_COMPLETION(lradc->completion);
+	reinit_completion(&lradc->completion);
 
 	/*
 	 * No buffered operation in progress, map the channel and trigger it.
diff --git a/drivers/staging/media/solo6x10/solo6x10-p2m.c b/drivers/staging/media/solo6x10/solo6x10-p2m.c
index 333594189b81..7f2f2472655b 100644
--- a/drivers/staging/media/solo6x10/solo6x10-p2m.c
+++ b/drivers/staging/media/solo6x10/solo6x10-p2m.c
@@ -87,7 +87,7 @@ int solo_p2m_dma_desc(struct solo_dev *solo_dev,
 	if (mutex_lock_interruptible(&p2m_dev->mutex))
 		return -EINTR;
 
-	INIT_COMPLETION(p2m_dev->completion);
+	reinit_completion(&p2m_dev->completion);
 	p2m_dev->error = 0;
 
 	if (desc_cnt > 1 && solo_dev->type != SOLO_DEV_6110 && desc_mode) {
diff --git a/drivers/staging/tidspbridge/core/sync.c b/drivers/staging/tidspbridge/core/sync.c
index 7bb550acaf4a..743ff09d82d2 100644
--- a/drivers/staging/tidspbridge/core/sync.c
+++ b/drivers/staging/tidspbridge/core/sync.c
@@ -72,7 +72,7 @@ int sync_wait_on_multiple_events(struct sync_object **events,
 	spin_lock_bh(&sync_lock);
 	for (i = 0; i < count; i++) {
 		if (completion_done(&events[i]->comp)) {
-			INIT_COMPLETION(events[i]->comp);
+			reinit_completion(&events[i]->comp);
 			*index = i;
 			spin_unlock_bh(&sync_lock);
 			status = 0;
@@ -92,7 +92,7 @@ int sync_wait_on_multiple_events(struct sync_object **events,
 	spin_lock_bh(&sync_lock);
 	for (i = 0; i < count; i++) {
 		if (completion_done(&events[i]->comp)) {
-			INIT_COMPLETION(events[i]->comp);
+			reinit_completion(&events[i]->comp);
 			*index = i;
 			status = 0;
 		}
diff --git a/drivers/staging/tidspbridge/include/dspbridge/sync.h b/drivers/staging/tidspbridge/include/dspbridge/sync.h
index 58a0d5c5543d..fc19b9707087 100644
--- a/drivers/staging/tidspbridge/include/dspbridge/sync.h
+++ b/drivers/staging/tidspbridge/include/dspbridge/sync.h
@@ -59,7 +59,7 @@ static inline void sync_init_event(struct sync_object *event)
 
 static inline void sync_reset_event(struct sync_object *event)
 {
-	INIT_COMPLETION(event->comp);
+	reinit_completion(&event->comp);
 	event->multi_comp = NULL;
 }
 
diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c
index 6d04eb48bfbc..1aa4a3fd0f1b 100644
--- a/drivers/staging/tidspbridge/rmgr/drv_interface.c
+++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c
@@ -332,7 +332,7 @@ static void bridge_recover(struct work_struct *work)
 	struct dev_object *dev;
 	struct cfg_devnode *dev_node;
 	if (atomic_read(&bridge_cref)) {
-		INIT_COMPLETION(bridge_comp);
+		reinit_completion(&bridge_comp);
 		while (!wait_for_completion_timeout(&bridge_comp,
 						msecs_to_jiffies(REC_TIMEOUT)))
 			pr_info("%s:%d handle(s) still opened\n",
@@ -348,7 +348,7 @@ static void bridge_recover(struct work_struct *work)
 
 void bridge_recover_schedule(void)
 {
-	INIT_COMPLETION(bridge_open_comp);
+	reinit_completion(&bridge_open_comp);
 	recover = true;
 	queue_work(bridge_rec_queue, &bridge_recovery_work);
 }
@@ -389,7 +389,7 @@ static int omap3_bridge_startup(struct platform_device *pdev)
 #ifdef CONFIG_TIDSPBRIDGE_RECOVERY
 	bridge_rec_queue = create_workqueue("bridge_rec_queue");
 	INIT_WORK(&bridge_recovery_work, bridge_recover);
-	INIT_COMPLETION(bridge_comp);
+	reinit_completion(&bridge_comp);
 #endif
 
 #ifdef CONFIG_PM
diff --git a/drivers/tty/metag_da.c b/drivers/tty/metag_da.c
index 0e888621f484..7332e2ca4615 100644
--- a/drivers/tty/metag_da.c
+++ b/drivers/tty/metag_da.c
@@ -495,7 +495,7 @@ static int dashtty_write(struct tty_struct *tty, const unsigned char *buf,
 	count = dport->xmit_cnt;
 	/* xmit buffer no longer empty? */
 	if (count)
-		INIT_COMPLETION(dport->xmit_empty);
+		reinit_completion(&dport->xmit_empty);
 	mutex_unlock(&dport->xmit_lock);
 
 	if (total) {
diff --git a/drivers/usb/c67x00/c67x00-sched.c b/drivers/usb/c67x00/c67x00-sched.c
index aa491627a45b..892cc96466eb 100644
--- a/drivers/usb/c67x00/c67x00-sched.c
+++ b/drivers/usb/c67x00/c67x00-sched.c
@@ -344,7 +344,7 @@ void c67x00_endpoint_disable(struct usb_hcd *hcd, struct usb_host_endpoint *ep)
 		/* it could happen that we reinitialize this completion, while
 		 * somebody was waiting for that completion.  The timeout and
 		 * while loop handle such cases, but this might be improved */
-		INIT_COMPLETION(c67x00->endpoint_disable);
+		reinit_completion(&c67x00->endpoint_disable);
 		c67x00_sched_kick(c67x00);
 		wait_for_completion_timeout(&c67x00->endpoint_disable, 1 * HZ);
 
diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c
index 44cf775a8627..774e8b89cdb5 100644
--- a/drivers/usb/gadget/f_fs.c
+++ b/drivers/usb/gadget/f_fs.c
@@ -373,7 +373,7 @@ static int __ffs_ep0_queue_wait(struct ffs_data *ffs, char *data, size_t len)
 	if (req->buf == NULL)
 		req->buf = (void *)0xDEADBABE;
 
-	INIT_COMPLETION(ffs->ep0req_completion);
+	reinit_completion(&ffs->ep0req_completion);
 
 	ret = usb_ep_queue(ffs->gadget->ep0, req, GFP_ATOMIC);
 	if (unlikely(ret < 0))
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 84657e07dc5d..439c951f261b 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -455,7 +455,7 @@ static int parport_prologue(struct parport *pp)
 		return -1;
 	}
 	mos_parport->msg_pending = true;   /* synch usb call pending */
-	INIT_COMPLETION(mos_parport->syncmsg_compl);
+	reinit_completion(&mos_parport->syncmsg_compl);
 	spin_unlock(&release_lock);
 
 	mutex_lock(&mos_parport->serial->disc_mutex);
diff --git a/drivers/video/exynos/exynos_mipi_dsi_common.c b/drivers/video/exynos/exynos_mipi_dsi_common.c
index 7eed957b6014..85edabfdef5a 100644
--- a/drivers/video/exynos/exynos_mipi_dsi_common.c
+++ b/drivers/video/exynos/exynos_mipi_dsi_common.c
@@ -220,7 +220,7 @@ int exynos_mipi_dsi_wr_data(struct mipi_dsim_device *dsim, unsigned int data_id,
 	case MIPI_DSI_DCS_LONG_WRITE:
 	{
 		unsigned int size, payload = 0;
-		INIT_COMPLETION(dsim_wr_comp);
+		reinit_completion(&dsim_wr_comp);
 
 		size = data_size * 4;
 
@@ -356,7 +356,7 @@ int exynos_mipi_dsi_rd_data(struct mipi_dsim_device *dsim, unsigned int data_id,
 	msleep(20);
 
 	mutex_lock(&dsim->lock);
-	INIT_COMPLETION(dsim_rd_comp);
+	reinit_completion(&dsim_rd_comp);
 	exynos_mipi_dsi_rd_tx_header(dsim,
 		MIPI_DSI_SET_MAXIMUM_RETURN_PACKET_SIZE, req_size);
 
diff --git a/drivers/video/omap2/displays-new/encoder-tpd12s015.c b/drivers/video/omap2/displays-new/encoder-tpd12s015.c
index 798ef200b055..d5c936cb217f 100644
--- a/drivers/video/omap2/displays-new/encoder-tpd12s015.c
+++ b/drivers/video/omap2/displays-new/encoder-tpd12s015.c
@@ -69,7 +69,7 @@ static int tpd_connect(struct omap_dss_device *dssdev,
 	dst->src = dssdev;
 	dssdev->dst = dst;
 
-	INIT_COMPLETION(ddata->hpd_completion);
+	reinit_completion(&ddata->hpd_completion);
 
 	gpio_set_value_cansleep(ddata->ct_cp_hpd_gpio, 1);
 	/* DC-DC converter needs at max 300us to get to 90% of 5V */
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 000eae2782b6..2f6735dbf1a9 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -392,7 +392,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 
 		wait_for_completion(&ecr->completion);
 		rc = ecr->rc;
-		INIT_COMPLETION(ecr->completion);
+		reinit_completion(&ecr->completion);
 	}
 out:
 	ablkcipher_request_free(req);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c8e729deb4f7..74a7e12e10df 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -244,7 +244,7 @@ static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
 	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
 	spin_lock(&tbl->slot_tbl_lock);
 	if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
-		INIT_COMPLETION(tbl->complete);
+		reinit_completion(&tbl->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
 		return wait_for_completion_interruptible(&tbl->complete);
 	}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3a44a648dae7..3407b2c62b21 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1304,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
 {
 	wait_for_completion(&mw->mw_complete);
 	/* Re-arm the completion in case we want to wait on it again */
-	INIT_COMPLETION(mw->mw_complete);
+	reinit_completion(&mw->mw_complete);
 	return mw->mw_status;
 }
 
@@ -1355,7 +1355,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
 	else
 		ret = mw->mw_status;
 	/* Re-arm the completion in case we want to wait on it again */
-	INIT_COMPLETION(mw->mw_complete);
+	reinit_completion(&mw->mw_complete);
 	return ret;
 }
 
diff --git a/sound/firewire/dice.c b/sound/firewire/dice.c
index 6feee6614193..57bcd31fcc12 100644
--- a/sound/firewire/dice.c
+++ b/sound/firewire/dice.c
@@ -543,7 +543,7 @@ static int dice_change_rate(struct dice *dice, unsigned int clock_rate)
 	__be32 value;
 	int err;
 
-	INIT_COMPLETION(dice->clock_accepted);
+	reinit_completion(&dice->clock_accepted);
 
 	value = cpu_to_be32(clock_rate | CLOCK_SOURCE_ARX1);
 	err = snd_fw_transaction(dice->unit, TCODE_WRITE_QUADLET_REQUEST,
diff --git a/sound/soc/samsung/ac97.c b/sound/soc/samsung/ac97.c
index 2acf987844e8..350ba23a9893 100644
--- a/sound/soc/samsung/ac97.c
+++ b/sound/soc/samsung/ac97.c
@@ -74,7 +74,7 @@ static void s3c_ac97_activate(struct snd_ac97 *ac97)
 	if (stat == S3C_AC97_GLBSTAT_MAINSTATE_ACTIVE)
 		return; /* Return if already active */
 
-	INIT_COMPLETION(s3c_ac97.done);
+	reinit_completion(&s3c_ac97.done);
 
 	ac_glbctrl = readl(s3c_ac97.regs + S3C_AC97_GLBCTRL);
 	ac_glbctrl = S3C_AC97_GLBCTRL_ACLINKON;
@@ -103,7 +103,7 @@ static unsigned short s3c_ac97_read(struct snd_ac97 *ac97,
 
 	s3c_ac97_activate(ac97);
 
-	INIT_COMPLETION(s3c_ac97.done);
+	reinit_completion(&s3c_ac97.done);
 
 	ac_codec_cmd = readl(s3c_ac97.regs + S3C_AC97_CODEC_CMD);
 	ac_codec_cmd = S3C_AC97_CODEC_CMD_READ | AC_CMD_ADDR(reg);
@@ -140,7 +140,7 @@ static void s3c_ac97_write(struct snd_ac97 *ac97, unsigned short reg,
 
 	s3c_ac97_activate(ac97);
 
-	INIT_COMPLETION(s3c_ac97.done);
+	reinit_completion(&s3c_ac97.done);
 
 	ac_codec_cmd = readl(s3c_ac97.regs + S3C_AC97_CODEC_CMD);
 	ac_codec_cmd = AC_CMD_ADDR(reg) | AC_CMD_DATA(val);
-- 
cgit v1.3


From a99b7069aab8fc3fb4f26d15795dc280b52e38b1 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 14 Nov 2013 14:32:18 -0800
Subject: hfsplus: Fix undefined __divdi3 in hfsplus_init_header_node()

ERROR: "__divdi3" [fs/hfsplus/hfsplus.ko] undefined!

Introduced by commit 099e9245e04d ("hfsplus: implement attributes file's
header node initialization code").

i_size_read() returns loff_t, which is long long, i.e.  64-bit.  node_size
is size_t, which is either 32-bit or 64-bit.  Hence
"i_size_read(attr_file) / node_size" is a 64-by-32 or 64-by-64 division,
causing (some versions of) gcc to emit a call to __divdi3().

Fortunately node_size is actually 16-bit, as the sole caller of
hfsplus_init_header_node() passes a u16.  Hence change its type from
size_t to u16, and use do_div() to perform a 64-by-32 division.

Not seen in m68k/allmodconfig in -next, so it really depends on the
verion of gcc.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Vyacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/xattr.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index efc85b1377cc..3c6136f98c73 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -129,7 +129,7 @@ static int can_set_xattr(struct inode *inode, const char *name,
 
 static void hfsplus_init_header_node(struct inode *attr_file,
 					u32 clump_size,
-					char *buf, size_t node_size)
+					char *buf, u16 node_size)
 {
 	struct hfs_bnode_desc *desc;
 	struct hfs_btree_header_rec *head;
@@ -139,8 +139,9 @@ static void hfsplus_init_header_node(struct inode *attr_file,
 	char *bmp;
 	u32 used_nodes;
 	u32 used_bmp_bytes;
+	loff_t tmp;
 
-	hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %zu\n",
+	hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n",
 				clump_size, node_size);
 
 	/* The end of the node contains list of record offsets */
@@ -154,7 +155,9 @@ static void hfsplus_init_header_node(struct inode *attr_file,
 
 	head = (struct hfs_btree_header_rec *)(buf + offset);
 	head->node_size = cpu_to_be16(node_size);
-	head->node_count = cpu_to_be32(i_size_read(attr_file) / node_size);
+	tmp = i_size_read(attr_file);
+	do_div(tmp, node_size);
+	head->node_count = cpu_to_be32(tmp);
 	head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1);
 	head->clump_size = cpu_to_be32(clump_size);
 	head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS);
-- 
cgit v1.3


From 46e0f66a0cf9e1fe25bfdcf4a60c08aface85998 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@fusionio.com>
Date: Fri, 15 Nov 2013 12:14:55 +0100
Subject: btrfs: fix empty_zero_page misusage

Heiko Carstens noticed that btrfs was using empty_zero_page
incorrectly.  He explained:

	The definition of empty_zero_page is architecture specific.  It
	is (currently) either a character array, an unsigned long
	containing the address of the empty_zero_page, or even worse
	only the address of the struct page belonging to the
	empty_zero_page.

This commit changes btrfs to use a for-loop instead.  On x86
the resulting .ko is smaller, and we're no longer worrying about
how each arch builds its zeros.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1d04b5559e61..64a0876c7872 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -368,8 +368,13 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 
 int btrfs_is_empty_uuid(u8 *uuid)
 {
-	BUILD_BUG_ON(BTRFS_UUID_SIZE > PAGE_SIZE);
-	return !memcmp(uuid, empty_zero_page, BTRFS_UUID_SIZE);
+	int i;
+
+	for (i = 0; i < BTRFS_UUID_SIZE; i++) {
+		if (uuid[i])
+			return 0;
+	}
+	return 1;
 }
 
 static noinline int create_subvol(struct inode *dir,
-- 
cgit v1.3


From 54563d41a58be77e9bd9ef7af1ea4026cf0e7e07 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 1 Sep 2013 15:57:51 -0400
Subject: btrfs: get rid of fdentry()

3 of 4 callers actually want file_inode()...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.h | 5 -----
 fs/btrfs/ioctl.c | 8 ++++----
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aea4433081dc..f9aeb2759a64 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3111,11 +3111,6 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
 	((unsigned long)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset_nr(leaf, slot)))
 
-static inline struct dentry *fdentry(struct file *file)
-{
-	return file->f_path.dentry;
-}
-
 static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 {
 	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 64a0876c7872..a111622598b0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -320,7 +320,7 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 
 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
+	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
 	struct btrfs_device *device;
 	struct request_queue *q;
 	struct fstrim_range range;
@@ -2089,7 +2089,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 					     void __user *arg)
 {
-	struct dentry *parent = fdentry(file);
+	struct dentry *parent = file->f_path.dentry;
 	struct dentry *dentry;
 	struct inode *dir = parent->d_inode;
 	struct inode *inode;
@@ -3105,7 +3105,7 @@ out:
 static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				       u64 off, u64 olen, u64 destoff)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct fd src_file;
 	struct inode *src;
@@ -4304,7 +4304,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
 
 static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
-- 
cgit v1.3


From 9e08ef1afb109c2059e7c1e4755fcafd9be37260 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 14 Nov 2013 13:00:17 +1100
Subject: NFS: correctly report misuse of "migration" mount option.

The current test on valid use of the "migration" mount option can never
report an error as it will only do so if
    mnt->version !=4 && mnt->minor_version != 0
(and some other condition), but if that test would succeed, then the previous
test has already gone-to  out_minorversion_mismatch.

So change the && to an || to get correct semantics.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 317d6fc2160e..910ed906eb82 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1614,7 +1614,7 @@ static int nfs_parse_mount_options(char *raw,
 		goto out_minorversion_mismatch;
 
 	if (mnt->options & NFS_OPTION_MIGRATION &&
-	    mnt->version != 4 && mnt->minorversion != 0)
+	    (mnt->version != 4 || mnt->minorversion != 0))
 		goto out_migration_misuse;
 
 	/*
-- 
cgit v1.3


From 8c2fabc6542d9d0f8b16bd1045c2eda59bdcde13 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 13 Nov 2013 08:50:33 -0800
Subject: nfs: fix pnfs Kconfig defaults

Defaulting to m seem to prevent building the pnfs layout modules into the
kernel.  Default to the value of CONFIG_NFS_V4 make sure they are
built in for built-in NFSv4 support and modular for a modular NFSv4.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 38c1768b4142..3dece03f2fc8 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -116,17 +116,17 @@ config NFS_V4_2
 config PNFS_FILE_LAYOUT
 	tristate
 	depends on NFS_V4_1
-	default m
+	default NFS_V4
 
 config PNFS_BLOCK
 	tristate
 	depends on NFS_V4_1 && BLK_DEV_DM
-	default m
+	default NFS_V4
 
 config PNFS_OBJLAYOUT
 	tristate
 	depends on NFS_V4_1 && SCSI_OSD_ULD
-	default m
+	default NFS_V4
 
 config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
 	string "NFSv4.1 Implementation ID Domain"
-- 
cgit v1.3


From de9f68df675e9d8b0b6a533818a79dca6546a7ba Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Fri, 15 Nov 2013 11:26:24 -0600
Subject: [CIFS] Set copychunk defaults

Patch 2 of the copy chunk series (the final patch will
use these to handle copies of files larger than the chunk size.

We set the same defaults that Windows and Samba expect for
CopyChunk.

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: David Disseldorp <ddiss@samba.org>
---
 fs/cifs/cifsglob.h |  3 +++
 fs/cifs/smb2pdu.c  | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 50a6ca1bb521..d9ea7ada1378 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -858,6 +858,9 @@ struct cifs_tcon {
 	__le64 vol_create_time;
 	__u32 ss_flags;		/* sector size flags */
 	__u32 perf_sector_size; /* best sector size for perf */
+	__u32 max_chunks;
+	__u32 max_bytes_chunk;
+	__u32 max_bytes_copy;
 #endif /* CONFIG_CIFS_SMB2 */
 #ifdef CONFIG_CIFS_FSCACHE
 	u64 resource_id;		/* server resource id */
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 8ab05b0d6778..ca7f307ad97a 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -717,6 +717,14 @@ static inline void cifs_stats_fail_inc(struct cifs_tcon *tcon, uint16_t code)
 
 #define MAX_SHARENAME_LENGTH (255 /* server */ + 80 /* share */ + 1 /* NULL */)
 
+/* These are similar values to what Windows uses */
+static inline void init_copy_chunk_defaults(struct cifs_tcon *tcon)
+{
+	tcon->max_chunks = 256;
+	tcon->max_bytes_chunk = 1048576;
+	tcon->max_bytes_copy = 16777216;
+}
+
 int
 SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	  struct cifs_tcon *tcon, const struct nls_table *cp)
@@ -818,7 +826,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
 	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
 		cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
-
+	init_copy_chunk_defaults(tcon);
 tcon_exit:
 	free_rsp_buf(resp_buftype, rsp);
 	kfree(unc_path);
-- 
cgit v1.3


From b1d93356427be6f050dc55c86eb019d173700af6 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Fri, 15 Nov 2013 20:41:32 -0600
Subject: setfacl removes part of ACL when setting POSIX ACLs to Samba

setfacl over cifs mounts can remove the default ACL when setting the
(non-default part of) the ACL and vice versa (we were leaving at 0
rather than setting to -1 the count field for the unaffected
half of the ACL.  For example notice the setfacl removed
the default ACL in this sequence:

steven@steven-GA-970A-DS3:~/cifs-2.6$ getfacl /mnt/test-dir ; setfacl
-m default:user:test:rwx,user:test:rwx /mnt/test-dir
getfacl: Removing leading '/' from absolute path names
user::rwx
group::r-x
other::r-x
default:user::rwx
default:user:test:rwx
default:group::r-x
default:mask::rwx
default:other::r-x

steven@steven-GA-970A-DS3:~/cifs-2.6$ getfacl /mnt/test-dir
getfacl: Removing leading '/' from absolute path names
user::rwx
user:test:rwx
group::r-x
mask::rwx
other::r-x

CC: Stable <stable@kernel.org>
Signed-off-by: Steve French <smfrench@gmail.com>
Acked-by: Jeremy Allison <jra@samba.org>
---
 fs/cifs/cifssmb.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 93b29474714a..124aa0230c1b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3369,11 +3369,13 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 		return 0;
 	}
 	cifs_acl->version = cpu_to_le16(1);
-	if (acl_type == ACL_TYPE_ACCESS)
+	if (acl_type == ACL_TYPE_ACCESS) {
 		cifs_acl->access_entry_count = cpu_to_le16(count);
-	else if (acl_type == ACL_TYPE_DEFAULT)
+		cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF);
+	} else if (acl_type == ACL_TYPE_DEFAULT) {
 		cifs_acl->default_entry_count = cpu_to_le16(count);
-	else {
+		cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF);
+	} else {
 		cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
 		return 0;
 	}
-- 
cgit v1.3


From db51242d89b3059a46a3cf2f3339f8cd975cb954 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Nov 2013 21:55:52 -0500
Subject: dump_align(): fix the dumb braino

Mea culpa - original variant used 64-by-32-bit division,
which got caught very late.  Getting rid of that wasn't
hard, but I'd managed to botch the calling conventions
in process ;-/

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coredump.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index 62406b6959b6..a2856f7bb613 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -733,7 +733,7 @@ int dump_align(struct coredump_params *cprm, int align)
 {
 	unsigned mod = cprm->written & (align - 1);
 	if (align & (align - 1))
-		return -EINVAL;
-	return mod ? dump_skip(cprm, align - mod) : 0;
+		return 0;
+	return mod ? dump_skip(cprm, align - mod) : 1;
 }
 EXPORT_SYMBOL(dump_align);
-- 
cgit v1.3


From 52da40ae67f6192a3bf70a98cb560e1423554953 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Nov 2013 21:58:33 -0500
Subject: dump_emit(): use __kernel_write(), not vfs_write()

the caller has already done file_start_write()...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coredump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/coredump.c b/fs/coredump.c
index a2856f7bb613..bc3fbcd32558 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -695,7 +695,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 	while (nr) {
 		if (dump_interrupted())
 			return 0;
-		n = vfs_write(file, addr, nr, &pos);
+		n = __kernel_write(file, addr, nr, &pos);
 		if (n <= 0)
 			return 0;
 		file->f_pos = pos;
-- 
cgit v1.3


From 951b4bd553e35a291e6b5732ab0124619e81da05 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Jun 2013 19:53:40 -0400
Subject: gfs2: endianness misannotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/lock_dlm.c |  8 ++++----
 fs/gfs2/quota.c    | 23 ++++++++++-------------
 fs/gfs2/rgrp.c     |  4 ++--
 3 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index c8423d6de6c3..2a6ba06bee6f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -466,19 +466,19 @@ static void gdlm_cancel(struct gfs2_glock *gl)
 static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
 			     char *lvb_bits)
 {
-	uint32_t gen;
+	__le32 gen;
 	memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
-	memcpy(&gen, lvb_bits, sizeof(uint32_t));
+	memcpy(&gen, lvb_bits, sizeof(__le32));
 	*lvb_gen = le32_to_cpu(gen);
 }
 
 static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
 			      char *lvb_bits)
 {
-	uint32_t gen;
+	__le32 gen;
 	memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
 	gen = cpu_to_le32(lvb_gen);
-	memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
+	memcpy(ls->ls_control_lvb, &gen, sizeof(__le32));
 }
 
 static int all_jid_bits_clear(char *lvb)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 453b50eaddec..98236d0df3ca 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -667,7 +667,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	struct buffer_head *bh;
 	struct page *page;
 	void *kaddr, *ptr;
-	struct gfs2_quota q, *qp;
+	struct gfs2_quota q;
 	int err, nbytes;
 	u64 size;
 
@@ -683,28 +683,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 		return err;
 
 	err = -EIO;
-	qp = &q;
-	qp->qu_value = be64_to_cpu(qp->qu_value);
-	qp->qu_value += change;
-	qp->qu_value = cpu_to_be64(qp->qu_value);
-	qd->qd_qb.qb_value = qp->qu_value;
+	be64_add_cpu(&q.qu_value, change);
+	qd->qd_qb.qb_value = q.qu_value;
 	if (fdq) {
 		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
-			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
-			qd->qd_qb.qb_warn = qp->qu_warn;
+			q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
+			qd->qd_qb.qb_warn = q.qu_warn;
 		}
 		if (fdq->d_fieldmask & FS_DQ_BHARD) {
-			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
-			qd->qd_qb.qb_limit = qp->qu_limit;
+			q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
+			qd->qd_qb.qb_limit = q.qu_limit;
 		}
 		if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
-			qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
-			qd->qd_qb.qb_value = qp->qu_value;
+			q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+			qd->qd_qb.qb_value = q.qu_value;
 		}
 	}
 
 	/* Write the quota into the quota file on disk */
-	ptr = qp;
+	ptr = &q;
 	nbytes = sizeof(struct gfs2_quota);
 get_a_page:
 	page = find_or_create_page(mapping, index, GFP_NOFS);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4d83abdd5635..c8d6161bd682 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1127,7 +1127,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
 		rgd->rd_free_clone = rgd->rd_free;
 	}
-	if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
+	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
 				     rgd->rd_bits[0].bi_bh->b_data);
@@ -1161,7 +1161,7 @@ int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
 		return 0;
 
-	if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
+	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
 		return gfs2_rgrp_bh_get(rgd);
 
 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
-- 
cgit v1.3


From b26d4cd385fc51e8844e2cdf9ba2051f5bba11a5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Oct 2013 18:47:37 -0400
Subject: consolidate simple ->d_delete() instances

Rename simple_delete_dentry() to always_delete_dentry() and export it.
Export simple_dentry_operations, while we are at it, and get rid of
their duplicates

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/perfmon.c |  8 +-------
 fs/9p/vfs_dentry.c         | 19 +------------------
 fs/configfs/dir.c          | 12 +-----------
 fs/efivarfs/super.c        | 11 +----------
 fs/hostfs/hostfs_kern.c    | 11 +----------
 fs/libfs.c                 | 12 +++++++-----
 fs/proc/generic.c          | 18 +-----------------
 fs/proc/namespaces.c       |  8 +-------
 include/linux/fs.h         |  2 ++
 kernel/cgroup.c            |  7 +------
 net/sunrpc/rpc_pipe.c      | 11 +----------
 11 files changed, 18 insertions(+), 101 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 5a9ff1c3c3e9..cb592773c78b 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2166,12 +2166,6 @@ static const struct file_operations pfm_file_ops = {
 	.flush		= pfm_flush
 };
 
-static int
-pfmfs_delete_dentry(const struct dentry *dentry)
-{
-	return 1;
-}
-
 static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
 	return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]",
@@ -2179,7 +2173,7 @@ static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen)
 }
 
 static const struct dentry_operations pfmfs_dentry_operations = {
-	.d_delete = pfmfs_delete_dentry,
+	.d_delete = always_delete_dentry,
 	.d_dname = pfmfs_dname,
 };
 
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f039b104a98e..b03dd23feda8 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -42,23 +42,6 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-/**
- * v9fs_dentry_delete - called when dentry refcount equals 0
- * @dentry:  dentry in question
- *
- * By returning 1 here we should remove cacheing of unused
- * dentry components.
- *
- */
-
-static int v9fs_dentry_delete(const struct dentry *dentry)
-{
-	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
-		 dentry->d_name.name, dentry);
-
-	return 1;
-}
-
 /**
  * v9fs_cached_dentry_delete - called when dentry refcount equals 0
  * @dentry:  dentry in question
@@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
 };
 
 const struct dentry_operations v9fs_dentry_operations = {
-	.d_delete = v9fs_dentry_delete,
+	.d_delete = always_delete_dentry,
 	.d_release = v9fs_dentry_release,
 };
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 277bd1be21fd..4522e0755773 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -66,19 +66,9 @@ static void configfs_d_iput(struct dentry * dentry,
 	iput(inode);
 }
 
-/*
- * We _must_ delete our dentries on last dput, as the chain-to-parent
- * behavior is required to clear the parents of default_groups.
- */
-static int configfs_d_delete(const struct dentry *dentry)
-{
-	return 1;
-}
-
 const struct dentry_operations configfs_dentry_ops = {
 	.d_iput		= configfs_d_iput,
-	/* simple_delete_dentry() isn't exported */
-	.d_delete	= configfs_d_delete,
+	.d_delete	= always_delete_dentry,
 };
 
 #ifdef CONFIG_LOCKDEP
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index a8766b880c07..becc725a1953 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
 	return 0;
 }
 
-/*
- * Retaining negative dentries for an in-memory filesystem just wastes
- * memory and lookup time: arrange for them to be deleted immediately.
- */
-static int efivarfs_delete_dentry(const struct dentry *dentry)
-{
-	return 1;
-}
-
 static struct dentry_operations efivarfs_d_ops = {
 	.d_compare = efivarfs_d_compare,
 	.d_hash = efivarfs_d_hash,
-	.d_delete = efivarfs_delete_dentry,
+	.d_delete = always_delete_dentry,
 };
 
 static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 25437280a207..db23ce1bd903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 
 #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
 
-static int hostfs_d_delete(const struct dentry *dentry)
-{
-	return 1;
-}
-
-static const struct dentry_operations hostfs_dentry_ops = {
-	.d_delete		= hostfs_d_delete,
-};
-
 /* Changed in hostfs_args before the kernel starts running */
 static char *root_ino = "";
 static int append = 0;
@@ -925,7 +916,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = HOSTFS_SUPER_MAGIC;
 	sb->s_op = &hostfs_sbops;
-	sb->s_d_op = &hostfs_dentry_ops;
+	sb->s_d_op = &simple_dentry_operations;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
 	/* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/libfs.c b/fs/libfs.c
index 5de06947ba5e..a1844244246f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -47,10 +47,16 @@ EXPORT_SYMBOL(simple_statfs);
  * Retaining negative dentries for an in-memory filesystem just wastes
  * memory and lookup time: arrange for them to be deleted immediately.
  */
-static int simple_delete_dentry(const struct dentry *dentry)
+int always_delete_dentry(const struct dentry *dentry)
 {
 	return 1;
 }
+EXPORT_SYMBOL(always_delete_dentry);
+
+const struct dentry_operations simple_dentry_operations = {
+	.d_delete = always_delete_dentry,
+};
+EXPORT_SYMBOL(simple_dentry_operations);
 
 /*
  * Lookup the data. This is trivial - if the dentry didn't already
@@ -58,10 +64,6 @@ static int simple_delete_dentry(const struct dentry *dentry)
  */
 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
-	static const struct dentry_operations simple_dentry_operations = {
-		.d_delete = simple_delete_dentry,
-	};
-
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
 	if (!dentry->d_sb->s_d_op)
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 737e15615b04..cca93b6fb9a9 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -174,22 +174,6 @@ static const struct inode_operations proc_link_inode_operations = {
 	.follow_link	= proc_follow_link,
 };
 
-/*
- * As some entries in /proc are volatile, we want to 
- * get rid of unused dentries.  This could be made 
- * smarter: we could keep a "volatile" flag in the 
- * inode to indicate which ones to keep.
- */
-static int proc_delete_dentry(const struct dentry * dentry)
-{
-	return 1;
-}
-
-static const struct dentry_operations proc_dentry_operations =
-{
-	.d_delete	= proc_delete_dentry,
-};
-
 /*
  * Don't create negative dentries here, return -ENOENT by hand
  * instead.
@@ -209,7 +193,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 			inode = proc_get_inode(dir->i_sb, de);
 			if (!inode)
 				return ERR_PTR(-ENOMEM);
-			d_set_d_op(dentry, &proc_dentry_operations);
+			d_set_d_op(dentry, &simple_dentry_operations);
 			d_add(dentry, inode);
 			return NULL;
 		}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 49a7fff2e83a..9ae46b87470d 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static int ns_delete_dentry(const struct dentry *dentry)
-{
-	/* Don't cache namespace inodes when not in use */
-	return 1;
-}
-
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 {
 	struct inode *inode = dentry->d_inode;
@@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 
 const struct dentry_operations ns_dentry_operations =
 {
-	.d_delete	= ns_delete_dentry,
+	.d_delete	= always_delete_dentry,
 	.d_dname	= ns_dname,
 };
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf5d574ebdf4..121f11f001c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2622,7 +2622,9 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping,
 extern int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
+extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
+extern const struct dentry_operations simple_dentry_operations;
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e0839bcd48c8..4c62513fe19f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -895,11 +895,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
-static int cgroup_delete(const struct dentry *d)
-{
-	return 1;
-}
-
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -1486,7 +1481,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
 {
 	static const struct dentry_operations cgroup_dops = {
 		.d_iput = cgroup_diput,
-		.d_delete = cgroup_delete,
+		.d_delete = always_delete_dentry,
 	};
 
 	struct inode *inode =
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index d0d14a04dce1..bf04b30a788a 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -471,15 +471,6 @@ struct rpc_filelist {
 	umode_t mode;
 };
 
-static int rpc_delete_dentry(const struct dentry *dentry)
-{
-	return 1;
-}
-
-static const struct dentry_operations rpc_dentry_operations = {
-	.d_delete = rpc_delete_dentry,
-};
-
 static struct inode *
 rpc_get_inode(struct super_block *sb, umode_t mode)
 {
@@ -1266,7 +1257,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = RPCAUTH_GSSMAGIC;
 	sb->s_op = &s_ops;
-	sb->s_d_op = &rpc_dentry_operations;
+	sb->s_d_op = &simple_dentry_operations;
 	sb->s_time_gran = 1;
 
 	inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
-- 
cgit v1.3


From 2bc74feba12fbf052ec97aee8624c9f13593a9ac Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Oct 2013 16:39:14 -0400
Subject: take read_seqbegin_or_lock() and friends to seqlock.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c             | 29 -----------------------------
 include/linux/seqlock.h | 29 +++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 0a38ef8d7f00..667e23ab0b4c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
-/**
- * read_seqbegin_or_lock - begin a sequence number check or locking block
- * @lock: sequence lock
- * @seq : sequence number to be checked
- *
- * First try it once optimistically without taking the lock. If that fails,
- * take the lock. The sequence number is also used as a marker for deciding
- * whether to be a reader (even) or writer (odd).
- * N.B. seq must be initialized to an even number to begin with.
- */
-static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
-{
-	if (!(*seq & 1))	/* Even */
-		*seq = read_seqbegin(lock);
-	else			/* Odd */
-		read_seqlock_excl(lock);
-}
-
-static inline int need_seqretry(seqlock_t *lock, int seq)
-{
-	return !(seq & 1) && read_seqretry(lock, seq);
-}
-
-static inline void done_seqretry(seqlock_t *lock, int seq)
-{
-	if (seq & 1)
-		read_sequnlock_excl(lock);
-}
-
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 1e8a8b6e837d..cf87a24c0f92 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -354,6 +354,35 @@ static inline void read_sequnlock_excl(seqlock_t *sl)
 	spin_unlock(&sl->lock);
 }
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+	if (!(*seq & 1))	/* Even */
+		*seq = read_seqbegin(lock);
+	else			/* Odd */
+		read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+	return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+	if (seq & 1)
+		read_sequnlock_excl(lock);
+}
+
 static inline void read_seqlock_excl_bh(seqlock_t *sl)
 {
 	spin_lock_bh(&sl->lock);
-- 
cgit v1.3


From 482db9066199813d6b999b65a3171afdbec040b6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Oct 2013 16:41:01 -0400
Subject: dcache.c: get rid of pointless macros

D_HASH{MASK,BITS} are used once each, both in the same function (d_hash()).
At this point they are actively misguiding - they imply that values are
compiler constants, which is no longer true.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 667e23ab0b4c..7f079d00475a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -96,8 +96,6 @@ static struct kmem_cache *dentry_cache __read_mostly;
  * This hash-function tries to avoid losing too many bits of hash
  * information, yet avoid using a prime hash-size or similar.
  */
-#define D_HASHBITS     d_hash_shift
-#define D_HASHMASK     d_hash_mask
 
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
@@ -108,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
 					unsigned int hash)
 {
 	hash += (unsigned long) parent / L1_CACHE_BYTES;
-	hash = hash + (hash >> D_HASHBITS);
-	return dentry_hashtable + (hash & D_HASHMASK);
+	hash = hash + (hash >> d_hash_shift);
+	return dentry_hashtable + (hash & d_hash_mask);
 }
 
 /* Statistics gathering. */
-- 
cgit v1.3


From 31dec1327e377b6d91a8a6c92b5cd8513939a233 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Oct 2013 17:04:27 -0400
Subject: fold try_to_ascend() into the sole remaining caller

There used to be a bunch of tree-walkers in dcache.c, all alike.
try_to_ascend() had been introduced to abstract a piece of logics
duplicated in all of them.  These days all these tree-walkers are
implemented via the same iterator (d_walk()), which is the only
remaining caller of try_to_ascend(), so let's fold it back...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 49 ++++++++++++++++++-------------------------------
 1 file changed, 18 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 7f079d00475a..4bdb300b16e2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -438,7 +438,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 {
 	list_del(&dentry->d_u.d_child);
 	/*
-	 * Inform try_to_ascend() that we are no longer attached to the
+	 * Inform d_walk() that we are no longer attached to the
 	 * dentry tree
 	 */
 	dentry->d_flags |= DCACHE_DENTRY_KILLED;
@@ -1038,34 +1038,6 @@ void shrink_dcache_sb(struct super_block *sb)
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
-/*
- * This tries to ascend one level of parenthood, but
- * we can race with renaming, so we need to re-check
- * the parenthood after dropping the lock and check
- * that the sequence number still matches.
- */
-static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
-{
-	struct dentry *new = old->d_parent;
-
-	rcu_read_lock();
-	spin_unlock(&old->d_lock);
-	spin_lock(&new->d_lock);
-
-	/*
-	 * might go back up the wrong parent if we have had a rename
-	 * or deletion
-	 */
-	if (new != old->d_parent ||
-		 (old->d_flags & DCACHE_DENTRY_KILLED) ||
-		 need_seqretry(&rename_lock, seq)) {
-		spin_unlock(&new->d_lock);
-		new = NULL;
-	}
-	rcu_read_unlock();
-	return new;
-}
-
 /**
  * enum d_walk_ret - action to talke during tree walk
  * @D_WALK_CONTINUE:	contrinue walk
@@ -1154,9 +1126,24 @@ resume:
 	 */
 	if (this_parent != parent) {
 		struct dentry *child = this_parent;
-		this_parent = try_to_ascend(this_parent, seq);
-		if (!this_parent)
+		this_parent = child->d_parent;
+
+		rcu_read_lock();
+		spin_unlock(&child->d_lock);
+		spin_lock(&this_parent->d_lock);
+
+		/*
+		 * might go back up the wrong parent if we have had a rename
+		 * or deletion
+		 */
+		if (this_parent != child->d_parent ||
+			 (child->d_flags & DCACHE_DENTRY_KILLED) ||
+			 need_seqretry(&rename_lock, seq)) {
+			spin_unlock(&this_parent->d_lock);
+			rcu_read_unlock();
 			goto rename_retry;
+		}
+		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
-- 
cgit v1.3


From 0cbaa53cdd33080c1e2d67ad9295b83c7954f2b3 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Fri, 15 Nov 2013 23:50:24 -0600
Subject: [CIFS] Warn if SMB3 encryption required by server

We do not support SMB3 encryption yet, warn if server responds
that SMB3 encryption is mandatory.

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2pdu.c |  2 ++
 fs/cifs/smb2pdu.h | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index ca7f307ad97a..d65270c290a1 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -630,6 +630,8 @@ ssetup_ntlmssp_authenticate:
 		goto ssetup_exit;
 
 	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+	if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+		cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
 ssetup_exit:
 	free_rsp_buf(resp_buftype, rsp);
 
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index b50a129572cd..f88320bbb477 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -122,6 +122,23 @@ struct smb2_pdu {
 	__le16 StructureSize2; /* size of wct area (varies, request specific) */
 } __packed;
 
+struct smb2_transform_hdr {
+	__be32 smb2_buf_length;	/* big endian on wire */
+				/* length is only two or three bytes - with
+				 one or two byte type preceding it that MBZ */
+	__u8   ProtocolId[4];	/* 0xFD 'S' 'M' 'B' */
+	__u8   Signature[16];
+	__u8   Nonce[11];
+	__u8   Reserved[5];
+	__le32 OriginalMessageSize;
+	__u16  Reserved1;
+	__le16 EncryptionAlgorithm;
+	__u64  SessionId;
+} __packed;
+
+/* Encryption Algorithms */
+#define SMB2_ENCRYPTION_AES128_CCM	__constant_cpu_to_le16(0x0001)
+
 /*
  *	SMB2 flag definitions
  */
@@ -237,6 +254,7 @@ struct smb2_sess_setup_req {
 /* Currently defined SessionFlags */
 #define SMB2_SESSION_FLAG_IS_GUEST	0x0001
 #define SMB2_SESSION_FLAG_IS_NULL	0x0002
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA	0x0004
 struct smb2_sess_setup_rsp {
 	struct smb2_hdr hdr;
 	__le16 StructureSize; /* Must be 9 */
-- 
cgit v1.3


From 9e3908e342eba6684621e616529669c17e271e7e Mon Sep 17 00:00:00 2001
From: Mark Tinguely <tinguely@sgi.com>
Date: Thu, 7 Nov 2013 15:43:28 -0600
Subject: xfs: fix unlock in xfs_bmap_add_attrfork

xfs_trans_ijoin() activates the inode in a transaction and
also can specify which lock to free when the transaction is
committed or canceled.

xfs_bmap_add_attrfork call locks and adds the lock to the
transaction but also manually removes the lock. Change the
routine to not add the lock to the transaction and manually
remove lock on completion.

While here, clean up the xfs_trans_cancel flags and goto names.

Signed-off-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_bmap.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 1c02da8bb7df..3ef11b22e750 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1137,6 +1137,7 @@ xfs_bmap_add_attrfork(
 	int			committed;	/* xaction was committed */
 	int			logflags;	/* logging flags */
 	int			error;		/* error return value */
+	int			cancel_flags = 0;
 
 	ASSERT(XFS_IFORK_Q(ip) == 0);
 
@@ -1147,19 +1148,20 @@ xfs_bmap_add_attrfork(
 	if (rsvd)
 		tp->t_flags |= XFS_TRANS_RESERVE;
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
-	if (error)
-		goto error0;
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
 			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 			XFS_QMOPT_RES_REGBLKS);
-	if (error) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
-		return error;
-	}
+	if (error)
+		goto trans_cancel;
+	cancel_flags |= XFS_TRANS_ABORT;
 	if (XFS_IFORK_Q(ip))
-		goto error1;
+		goto trans_cancel;
 	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
 		/*
 		 * For inodes coming from pre-6.2 filesystems.
@@ -1169,7 +1171,7 @@ xfs_bmap_add_attrfork(
 	}
 	ASSERT(ip->i_d.di_anextents == 0);
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	switch (ip->i_d.di_format) {
@@ -1191,7 +1193,7 @@ xfs_bmap_add_attrfork(
 	default:
 		ASSERT(0);
 		error = XFS_ERROR(EINVAL);
-		goto error1;
+		goto trans_cancel;
 	}
 
 	ASSERT(ip->i_afp == NULL);
@@ -1219,7 +1221,7 @@ xfs_bmap_add_attrfork(
 	if (logflags)
 		xfs_trans_log_inode(tp, ip, logflags);
 	if (error)
-		goto error2;
+		goto bmap_cancel;
 	if (!xfs_sb_version_hasattr(&mp->m_sb) ||
 	   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
 		__int64_t sbfields = 0;
@@ -1242,14 +1244,16 @@ xfs_bmap_add_attrfork(
 
 	error = xfs_bmap_finish(&tp, &flist, &committed);
 	if (error)
-		goto error2;
-	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-error2:
+		goto bmap_cancel;
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+
+bmap_cancel:
 	xfs_bmap_cancel(&flist);
-error1:
+trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 	return error;
 }
 
-- 
cgit v1.3


From 8f80587bacb6eb893df0ee4e35fefa0dfcfdf9f4 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 1 Nov 2013 15:27:20 +1100
Subject: xfs: increase inode cluster size for v5 filesystems

v5 filesystems use 512 byte inodes as a minimum, so read inodes in
clusters that are effectively half the size of a v4 filesystem with
256 byte inodes. For v5 fielsystems, scale the inode cluster size
with the size of the inode so that we keep a constant 32 inodes per
cluster ratio for all inode IO.

This only works if mkfs.xfs sets the inode alignment appropriately
for larger inode clusters, so this functionality is made conditional
on mkfs doing the right thing. xfs_repair needs to know about
the inode alignment changes, too.

Wall time:
	create	bulkstat	find+stat	ls -R	unlink
v4	237s	161s		173s		201s	299s
v5	235s	163s		205s		 31s	356s
patched	234s	160s		182s		 29s	317s

System time:
	create	bulkstat	find+stat	ls -R	unlink
v4	2601s	2490s		1653s		1656s	2960s
v5	2637s	2497s		1681s		  20s	3216s
patched	2613s	2451s		1658s		  20s	3007s

So, wall time same or down across the board, system time same or
down across the board, and cache hit rates all improve except for
the ls -R case which is a pure cold cache directory read workload
on v5 filesystems...

So, this patch removes most of the performance and CPU usage
differential between v4 and v5 filesystems on traversal related
workloads.

Note: while this patch is currently for v5 filesystems only, there
is no reason it can't be ported back to v4 filesystems.  This hasn't
been done here because bringing the code back to v4 requires
forwards and backwards kernel compatibility testing.  i.e. to
deterine if older kernels(*) do the right thing with larger inode
alignments but still only using 8k inode cluster sizes. None of this
testing and validation on v4 filesystems has been done, so for the
moment larger inode clusters is limited to v5 superblocks.

(*) a current default config v4 filesystem should mount just fine on
2.6.23 (when lazy-count support was introduced), and so if we change
the alignment emitted by mkfs without a feature bit then we have to
make sure it works properly on all kernels since 2.6.23. And if we
allow it to be changed when the lazy-count bit is not set, then it's
all kernels since v2 logs were introduced that need to be tested for
compatibility...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_mount.c      | 15 +++++++++++++++
 fs/xfs/xfs_mount.h      |  2 +-
 fs/xfs/xfs_trans_resv.c |  3 +--
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da88f167af78..02df7b408a26 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
 #include "xfs_fsops.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_dinode.h"
 
 
 #ifdef HAVE_PERCPU_SB
@@ -718,8 +719,22 @@ xfs_mountfs(
 	 * Set the inode cluster size.
 	 * This may still be overridden by the file system
 	 * block size if it is larger than the chosen cluster size.
+	 *
+	 * For v5 filesystems, scale the cluster size with the inode size to
+	 * keep a constant ratio of inode per cluster buffer, but only if mkfs
+	 * has set the inode alignment value appropriately for larger cluster
+	 * sizes.
 	 */
 	mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		int	new_size = mp->m_inode_cluster_size;
+
+		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+			mp->m_inode_cluster_size = new_size;
+		xfs_info(mp, "Using inode cluster size of %d bytes",
+			 mp->m_inode_cluster_size);
+	}
 
 	/*
 	 * Set inode alignment fields
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d8101a10d8e..a466c5e5826e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -112,7 +112,7 @@ typedef struct xfs_mount {
 	__uint8_t		m_blkbb_log;	/* blocklog - BBSHIFT */
 	__uint8_t		m_agno_log;	/* log #ag's */
 	__uint8_t		m_agino_log;	/* #bits for agino in inum */
-	__uint16_t		m_inode_cluster_size;/* min inode buf size */
+	uint			m_inode_cluster_size;/* min inode buf size */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
 	uint			m_blockwmask;	/* blockwsize-1 */
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index d53d9f0627a7..2fd59c0dae66 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -385,8 +385,7 @@ xfs_calc_ifree_reservation(
 		xfs_calc_inode_res(mp, 1) +
 		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
-		    XFS_INODE_CLUSTER_SIZE(mp)) +
+		max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) +
 		xfs_calc_buf_res(1, 0) +
 		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
 				 mp->m_in_maxlevels, 0) +
-- 
cgit v1.3


From 2fe8c1c08b3fbd87dd2641c8f032ff6e965d5803 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 1 Nov 2013 15:27:17 +1100
Subject: xfs: open code inc_inode_iversion when logging an inode

Michael L Semon reported that generic/069 runtime increased on v5
superblocks by 100% compared to v4 superblocks. his perf-based
analysis pointed directly at the timestamp updates being done by the
write path in this workload. The append writers are doing 4-byte
writes, so there are lots of timestamp updates occurring.

The thing is, they aren't being triggered by timestamp changes -
they are being triggered by the inode change counter needing to be
updated. That is, every write(2) system call needs to bump the inode
version count, and it does that through the timestamp update
mechanism. Hence for v5 filesystems, test generic/069 is running 3
orders of magnitude more timestmap update transactions on v5
filesystems due to the fact it does a huge number of *4 byte*
write(2) calls.

This isn't a real world scenario we really need to address - anyone
doing such sequential IO should be using fwrite(3), not write(2).
i.e. fwrite(3) buffers the writes in userspace to minimise the
number of write(2) syscalls, and the problem goes away.

However, there is a small change we can make to improve the
situation - removing the expensive lock operation on the change
counter update.  All inode version counter changes in XFS occur
under the ip->i_ilock during a transaction, and therefore we
don't actually need the spin lock that provides exclusive access to
it through inc_inode_iversion().

Hence avoid the lock and just open code the increment ourselves when
logging the inode.

Reported-by: Michael L. Semon <mlsemon35@gmail.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_trans_inode.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 1bba7f60d94c..50c3f5614288 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -111,12 +111,14 @@ xfs_trans_log_inode(
 
 	/*
 	 * First time we log the inode in a transaction, bump the inode change
-	 * counter if it is configured for this to occur.
+	 * counter if it is configured for this to occur. We don't use
+	 * inode_inc_version() because there is no need for extra locking around
+	 * i_version as we already hold the inode locked exclusively for
+	 * metadata modification.
 	 */
 	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
 	    IS_I_VERSION(VFS_I(ip))) {
-		inode_inc_iversion(VFS_I(ip));
-		ip->i_d.di_changecount = VFS_I(ip)->i_version;
+		ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
 		flags |= XFS_ILOG_CORE;
 	}
 
-- 
cgit v1.3


From 818e5a22e907fbae75e9c1fd78233baec9fa64b6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 18 Nov 2013 05:07:30 -0800
Subject: nfsd: split up nfsd_setattr

Split out two helpers to make the code more readable and easier to verify
for correctness.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 144 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 84 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 94b5f5d2bfed..c3f57cf14a63 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -298,41 +298,12 @@ commit_metadata(struct svc_fh *fhp)
 }
 
 /*
- * Set various file attributes.
- * N.B. After this call fhp needs an fh_put
+ * Go over the attributes and take care of the small differences between
+ * NFS semantics and what Linux expects.
  */
-__be32
-nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
-	     int check_guard, time_t guardtime)
+static void
+nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
 {
-	struct dentry	*dentry;
-	struct inode	*inode;
-	int		accmode = NFSD_MAY_SATTR;
-	umode_t		ftype = 0;
-	__be32		err;
-	int		host_err;
-	int		size_change = 0;
-
-	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
-		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
-	if (iap->ia_valid & ATTR_SIZE)
-		ftype = S_IFREG;
-
-	/* Get inode */
-	err = fh_verify(rqstp, fhp, ftype, accmode);
-	if (err)
-		goto out;
-
-	dentry = fhp->fh_dentry;
-	inode = dentry->d_inode;
-
-	/* Ignore any mode updates on symlinks */
-	if (S_ISLNK(inode->i_mode))
-		iap->ia_valid &= ~ATTR_MODE;
-
-	if (!iap->ia_valid)
-		goto out;
-
 	/*
 	 * NFSv2 does not differentiate between "set-[ac]time-to-now"
 	 * which only requires access, and "set-[ac]time-to-X" which
@@ -342,8 +313,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	 * convert to "set to now" instead of "set to explicit time"
 	 *
 	 * We only call inode_change_ok as the last test as technically
-	 * it is not an interface that we should be using.  It is only
-	 * valid if the filesystem does not define it's own i_op->setattr.
+	 * it is not an interface that we should be using.
 	 */
 #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
 #define	MAX_TOUCH_TIME_ERROR (30*60)
@@ -369,30 +339,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			iap->ia_valid &= ~BOTH_TIME_SET;
 		}
 	}
-	    
-	/*
-	 * The size case is special.
-	 * It changes the file as well as the attributes.
-	 */
-	if (iap->ia_valid & ATTR_SIZE) {
-		if (iap->ia_size < inode->i_size) {
-			err = nfsd_permission(rqstp, fhp->fh_export, dentry,
-					NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
-			if (err)
-				goto out;
-		}
-
-		host_err = get_write_access(inode);
-		if (host_err)
-			goto out_nfserr;
-
-		size_change = 1;
-		host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
-		if (host_err) {
-			put_write_access(inode);
-			goto out_nfserr;
-		}
-	}
 
 	/* sanitize the mode change */
 	if (iap->ia_valid & ATTR_MODE) {
@@ -415,8 +361,86 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
 		}
 	}
+}
 
-	/* Change the attributes. */
+static __be32
+nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct iattr *iap)
+{
+	struct inode *inode = fhp->fh_dentry->d_inode;
+	int host_err;
+
+	if (iap->ia_size < inode->i_size) {
+		__be32 err;
+
+		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+				NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+		if (err)
+			return err;
+	}
+
+	host_err = get_write_access(inode);
+	if (host_err)
+		goto out_nfserrno;
+
+	host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+	if (host_err)
+		goto out_put_write_access;
+	return 0;
+
+out_put_write_access:
+	put_write_access(inode);
+out_nfserrno:
+	return nfserrno(host_err);
+}
+
+/*
+ * Set various file attributes.  After this call fhp needs an fh_put.
+ */
+__be32
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+	     int check_guard, time_t guardtime)
+{
+	struct dentry	*dentry;
+	struct inode	*inode;
+	int		accmode = NFSD_MAY_SATTR;
+	umode_t		ftype = 0;
+	__be32		err;
+	int		host_err;
+	int		size_change = 0;
+
+	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
+	if (iap->ia_valid & ATTR_SIZE)
+		ftype = S_IFREG;
+
+	/* Get inode */
+	err = fh_verify(rqstp, fhp, ftype, accmode);
+	if (err)
+		goto out;
+
+	dentry = fhp->fh_dentry;
+	inode = dentry->d_inode;
+
+	/* Ignore any mode updates on symlinks */
+	if (S_ISLNK(inode->i_mode))
+		iap->ia_valid &= ~ATTR_MODE;
+
+	if (!iap->ia_valid)
+		goto out;
+
+	nfsd_sanitize_attrs(inode, iap);
+
+	/*
+	 * The size case is special, it changes the file in addition to the
+	 * attributes.
+	 */
+	if (iap->ia_valid & ATTR_SIZE) {
+		err = nfsd_get_write_access(rqstp, fhp, iap);
+		if (err)
+			goto out;
+		size_change = 1;
+	}
 
 	iap->ia_valid |= ATTR_CTIME;
 
-- 
cgit v1.3


From 987da4791052fa298b7cfcde4dea9f6f2bbc786b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 18 Nov 2013 05:07:47 -0800
Subject: nfsd: make sure to balance get/put_write_access

Use a straight goto error label style in nfsd_setattr to make sure
we always do the put_write_access call after we got it earlier.

Note that the we have been failing to do that in the case
nfsd_break_lease() returns an error, a bug introduced into 2.6.38 with
6a76bebefe15d9a08864f824d7f8d5beaf37c997 "nfsd4: break lease on nfsd
setattr".

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c3f57cf14a63..7eea63cada1d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -444,27 +444,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 
 	iap->ia_valid |= ATTR_CTIME;
 
-	err = nfserr_notsync;
-	if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
-		host_err = nfsd_break_lease(inode);
-		if (host_err)
-			goto out_nfserr;
-		fh_lock(fhp);
-
-		host_err = notify_change(dentry, iap, NULL);
-		err = nfserrno(host_err);
-		fh_unlock(fhp);
+	if (check_guard && guardtime != inode->i_ctime.tv_sec) {
+		err = nfserr_notsync;
+		goto out_put_write_access;
 	}
+
+	host_err = nfsd_break_lease(inode);
+	if (host_err)
+		goto out_put_write_access_nfserror;
+
+	fh_lock(fhp);
+	host_err = notify_change(dentry, iap, NULL);
+	fh_unlock(fhp);
+
+out_put_write_access_nfserror:
+	err = nfserrno(host_err);
+out_put_write_access:
 	if (size_change)
 		put_write_access(inode);
 	if (!err)
 		commit_metadata(fhp);
 out:
 	return err;
-
-out_nfserr:
-	err = nfserrno(host_err);
-	goto out;
 }
 
 #if defined(CONFIG_NFSD_V2_ACL) || \
-- 
cgit v1.3


From 34f2fd8dfe6185b0eaaf7d661281713a6170b077 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 18 Nov 2013 22:11:42 +0900
Subject: bio: fix argument of __bio_add_page() for max_sectors > 0xffff

The data type of max_sectors and max_hw_sectors in queue settings are
unsigned int.  But these values are passed to __bio_add_page() as an
argument whose data type is unsigned short.  In the worst case such as
max_sectors is 0x10000, bio_add_page() can't add a page and IOs can't
proceed.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 2bdb4e25ee77..33d79a4eb92d 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs);
 
 static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 			  *page, unsigned int len, unsigned int offset,
-			  unsigned short max_sectors)
+			  unsigned int max_sectors)
 {
 	int retried_segments = 0;
 	struct bio_vec *bvec;
-- 
cgit v1.3


From 801a76050bcf8d4e500eb8d048ff6265f37a61c8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Tue, 19 Nov 2013 01:20:43 +0000
Subject: seq_file: always clear m->count when we free m->buf

Once we'd freed m->buf, m->count should become zero - we have no valid
contents reachable via m->buf.

Reported-by: Charley (Hao Chuan) Chu <charley.chu@broadcom.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 1cd2388ca5bd..1d641bb108d2 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -136,6 +136,7 @@ static int traverse(struct seq_file *m, loff_t offset)
 Eoverflow:
 	m->op->stop(m, p);
 	kfree(m->buf);
+	m->count = 0;
 	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
 	return !m->buf ? -ENOMEM : -EAGAIN;
 }
@@ -232,10 +233,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 			goto Fill;
 		m->op->stop(m, p);
 		kfree(m->buf);
+		m->count = 0;
 		m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
 		if (!m->buf)
 			goto Enomem;
-		m->count = 0;
 		m->version = 0;
 		pos = m->index;
 		p = m->op->start(m, &pos);
-- 
cgit v1.3


From c53ed7423619b4e8108914a9f31b426dd58ad591 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Nov 2013 15:19:31 +0100
Subject: genetlink: only pass array to genl_register_family_with_ops()

As suggested by David Miller, make genl_register_family_with_ops()
a macro and pass only the array, evaluating ARRAY_SIZE() in the
macro, this is a little safer.

The openvswitch has some indirection, assing ops/n_ops directly in
that code. This might ultimately just assign the pointers in the
family initializations, saving the struct genl_family_and_ops and
code (once mcast groups are handled differently.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c               |  3 +--
 drivers/net/wireless/mac80211_hwsim.c |  3 +--
 fs/dlm/netlink.c                      | 10 ++++++----
 include/linux/genl_magic_func.h       |  3 +--
 include/net/genetlink.h               |  8 ++++++--
 kernel/taskstats.c                    |  3 +--
 net/core/drop_monitor.c               |  3 +--
 net/hsr/hsr_netlink.c                 |  3 +--
 net/ieee802154/netlink.c              |  3 +--
 net/ipv4/tcp_metrics.c                |  3 +--
 net/irda/irnetlink.c                  |  3 +--
 net/l2tp/l2tp_netlink.c               |  7 +------
 net/netfilter/ipvs/ip_vs_ctl.c        |  2 +-
 net/netlabel/netlabel_cipso_v4.c      |  2 +-
 net/netlabel/netlabel_mgmt.c          |  2 +-
 net/netlabel/netlabel_unlabeled.c     |  2 +-
 net/netlink/genetlink.c               | 14 ++++++++------
 net/nfc/netlink.c                     |  3 +--
 net/openvswitch/datapath.c            |  5 +++--
 net/tipc/netlink.c                    | 11 ++++++-----
 net/wimax/stack.c                     |  4 ++--
 net/wireless/nl80211.c                |  3 +--
 22 files changed, 47 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 6390254beb7d..f55758b0840e 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2699,8 +2699,7 @@ static int team_nl_init(void)
 {
 	int err;
 
-	err = genl_register_family_with_ops(&team_nl_family, team_nl_ops,
-					    ARRAY_SIZE(team_nl_ops));
+	err = genl_register_family_with_ops(&team_nl_family, team_nl_ops);
 	if (err)
 		return err;
 
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index cfc3fda79a2d..9df7bc91a26f 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2148,8 +2148,7 @@ static int hwsim_init_netlink(void)
 
 	printk(KERN_INFO "mac80211_hwsim: initializing netlink\n");
 
-	rc = genl_register_family_with_ops(&hwsim_genl_family,
-		hwsim_ops, ARRAY_SIZE(hwsim_ops));
+	rc = genl_register_family_with_ops(&hwsim_genl_family, hwsim_ops);
 	if (rc)
 		goto failure;
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 60a327863b11..e7cfbaf8d0e2 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -74,14 +74,16 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
-static struct genl_ops dlm_nl_ops = {
-	.cmd		= DLM_CMD_HELLO,
-	.doit		= user_cmd,
+static struct genl_ops dlm_nl_ops[] = {
+	{
+		.cmd	= DLM_CMD_HELLO,
+		.doit	= user_cmd,
+	},
 };
 
 int __init dlm_netlink_init(void)
 {
-	return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
+	return genl_register_family_with_ops(&family, dlm_nl_ops);
 }
 
 void dlm_netlink_exit(void)
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 023bc346b877..47086030ab31 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -293,8 +293,7 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 
 int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
 {
-	int err = genl_register_family_with_ops(&ZZZ_genl_family,
-		ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops));
+	int err = genl_register_family_with_ops(&ZZZ_genl_family, ZZZ_genl_ops);
 	if (err)
 		return err;
 #undef GENL_mc_group
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index e96385d46b48..9bd52a4c5e17 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -152,8 +152,9 @@ static inline int genl_register_family(struct genl_family *family)
  *
  * Return 0 on success or a negative error code.
  */
-static inline int genl_register_family_with_ops(struct genl_family *family,
-	const struct genl_ops *ops, size_t n_ops)
+static inline int _genl_register_family_with_ops(struct genl_family *family,
+						 const struct genl_ops *ops,
+						 size_t n_ops)
 {
 	family->module = THIS_MODULE;
 	family->ops = ops;
@@ -161,6 +162,9 @@ static inline int genl_register_family_with_ops(struct genl_family *family,
 	return __genl_register_family(family);
 }
 
+#define genl_register_family_with_ops(family, ops)	\
+	_genl_register_family_with_ops((family), (ops), ARRAY_SIZE(ops))
+
 int genl_unregister_family(struct genl_family *family);
 int genl_register_mc_group(struct genl_family *family,
 			   struct genl_multicast_group *grp);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 76595cd9d211..13d2f7cd65db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -703,8 +703,7 @@ static int __init taskstats_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&family, taskstats_ops,
-					   ARRAY_SIZE(taskstats_ops));
+	rc = genl_register_family_with_ops(&family, taskstats_ops);
 	if (rc)
 		return rc;
 
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index f9fe2f22d20b..0efc5028ba9d 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -365,8 +365,7 @@ static int __init init_net_drop_monitor(void)
 	}
 
 	rc = genl_register_family_with_ops(&net_drop_monitor_family,
-					   dropmon_ops,
-					   ARRAY_SIZE(dropmon_ops));
+					   dropmon_ops);
 	if (rc) {
 		pr_err("Could not create drop monitor netlink family\n");
 		return rc;
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 3b9205d2afc4..f182260be76d 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -414,8 +414,7 @@ int __init hsr_netlink_init(void)
 	if (rc)
 		goto fail_rtnl_link_register;
 
-	rc = genl_register_family_with_ops(&hsr_genl_family, hsr_ops,
-					   ARRAY_SIZE(hsr_ops));
+	rc = genl_register_family_with_ops(&hsr_genl_family, hsr_ops);
 	if (rc)
 		goto fail_genl_register_family;
 
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 3ffcdbb56aab..1a81709a4717 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -129,8 +129,7 @@ int __init ieee802154_nl_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&nl802154_family, ieee8021154_ops,
-					   ARRAY_SIZE(ieee8021154_ops));
+	rc = genl_register_family_with_ops(&nl802154_family, ieee8021154_ops);
 	if (rc)
 		return rc;
 
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 8c121b523eee..06493736fbc8 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1082,8 +1082,7 @@ void __init tcp_metrics_init(void)
 	if (ret < 0)
 		goto cleanup;
 	ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
-					    tcp_metrics_nl_ops,
-					    ARRAY_SIZE(tcp_metrics_nl_ops));
+					    tcp_metrics_nl_ops);
 	if (ret < 0)
 		goto cleanup_subsys;
 	return;
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index bf5d7d476dae..a37b81fe0479 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -149,8 +149,7 @@ static const struct genl_ops irda_nl_ops[] = {
 
 int irda_nl_register(void)
 {
-	return genl_register_family_with_ops(&irda_nl_family,
-		irda_nl_ops, ARRAY_SIZE(irda_nl_ops));
+	return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops);
 }
 
 void irda_nl_unregister(void)
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 57db66e24f1f..4cfd722e9153 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -887,13 +887,8 @@ EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
 
 static int l2tp_nl_init(void)
 {
-	int err;
-
 	pr_info("L2TP netlink interface\n");
-	err = genl_register_family_with_ops(&l2tp_nl_family, l2tp_nl_ops,
-					    ARRAY_SIZE(l2tp_nl_ops));
-
-	return err;
+	return genl_register_family_with_ops(&l2tp_nl_family, l2tp_nl_ops);
 }
 
 static void l2tp_nl_cleanup(void)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index fc8a04ed8854..393498704691 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3666,7 +3666,7 @@ static const struct genl_ops ip_vs_genl_ops[] __read_mostly = {
 static int __init ip_vs_genl_register(void)
 {
 	return genl_register_family_with_ops(&ip_vs_genl_family,
-		ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
+					     ip_vs_genl_ops);
 }
 
 static void ip_vs_genl_unregister(void)
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 706691739b99..69345cebe3a3 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -783,5 +783,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
 int __init netlbl_cipsov4_genl_init(void)
 {
 	return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family,
-		netlbl_cipsov4_ops, ARRAY_SIZE(netlbl_cipsov4_ops));
+					     netlbl_cipsov4_ops);
 }
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 7de6f660b80a..8ef83ee97c6a 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -779,5 +779,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
 int __init netlbl_mgmt_genl_init(void)
 {
 	return genl_register_family_with_ops(&netlbl_mgmt_gnl_family,
-		netlbl_mgmt_genl_ops, ARRAY_SIZE(netlbl_mgmt_genl_ops));
+					     netlbl_mgmt_genl_ops);
 }
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 76ee9252daa7..43817d73ccf9 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1397,7 +1397,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
 int __init netlbl_unlabel_genl_init(void)
 {
 	return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
-		netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops));
+					     netlbl_unlabel_genl_ops);
 }
 
 /*
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f54215d7b8f3..c68ce73619b5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -906,11 +906,13 @@ static int genl_ctrl_event(int event, void *data)
 	return 0;
 }
 
-static struct genl_ops genl_ctrl_ops = {
-	.cmd		= CTRL_CMD_GETFAMILY,
-	.doit		= ctrl_getfamily,
-	.dumpit		= ctrl_dumpfamily,
-	.policy		= ctrl_policy,
+static struct genl_ops genl_ctrl_ops[] = {
+	{
+		.cmd		= CTRL_CMD_GETFAMILY,
+		.doit		= ctrl_getfamily,
+		.dumpit		= ctrl_dumpfamily,
+		.policy		= ctrl_policy,
+	},
 };
 
 static struct genl_multicast_group notify_grp = {
@@ -954,7 +956,7 @@ static int __init genl_init(void)
 	for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
 		INIT_LIST_HEAD(&family_ht[i]);
 
-	err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1);
+	err = genl_register_family_with_ops(&genl_ctrl, genl_ctrl_ops);
 	if (err < 0)
 		goto problem;
 
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f5585611c098..fe6760d328a0 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1536,8 +1536,7 @@ int __init nfc_genl_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&nfc_genl_family, nfc_genl_ops,
-					   ARRAY_SIZE(nfc_genl_ops));
+	rc = genl_register_family_with_ops(&nfc_genl_family, nfc_genl_ops);
 	if (rc)
 		return rc;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 91e1c927a465..8ec8b73033e0 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1817,8 +1817,9 @@ static int dp_register_genl(void)
 	for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
 		const struct genl_family_and_ops *f = &dp_genl_families[i];
 
-		err = genl_register_family_with_ops(f->family, f->ops,
-						    f->n_ops);
+		f->family->ops = f->ops;
+		f->family->n_ops = f->n_ops;
+		err = genl_register_family(f->family);
 		if (err)
 			goto error;
 		n_registered++;
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 8bcd4985d0fb..9f72a6376362 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -76,9 +76,11 @@ static struct genl_family tipc_genl_family = {
 	.maxattr	= 0,
 };
 
-static struct genl_ops tipc_genl_ops = {
-	.cmd		= TIPC_GENL_CMD,
-	.doit		= handle_cmd,
+static struct genl_ops tipc_genl_ops[] = {
+	{
+		.cmd		= TIPC_GENL_CMD,
+		.doit		= handle_cmd,
+	},
 };
 
 static int tipc_genl_family_registered;
@@ -87,8 +89,7 @@ int tipc_netlink_start(void)
 {
 	int res;
 
-	res = genl_register_family_with_ops(&tipc_genl_family,
-		&tipc_genl_ops, 1);
+	res = genl_register_family_with_ops(&tipc_genl_family, tipc_genl_ops);
 	if (res) {
 		pr_err("Failed to register netlink interface\n");
 		return res;
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 47170c9495f1..6328afe90319 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -597,8 +597,8 @@ int __init wimax_subsys_init(void)
 
 	snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
 		 "WiMAX");
-	result = genl_register_family_with_ops(&wimax_gnl_family, wimax_gnl_ops,
-					       ARRAY_SIZE(wimax_gnl_ops));
+	result = genl_register_family_with_ops(&wimax_gnl_family,
+					       wimax_gnl_ops);
 	if (unlikely(result < 0)) {
 		printk(KERN_ERR "cannot register generic netlink family: %d\n",
 		       result);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 58c43c8e149f..1b6c5dd4dccf 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -11329,8 +11329,7 @@ int nl80211_init(void)
 {
 	int err;
 
-	err = genl_register_family_with_ops(&nl80211_fam,
-		nl80211_ops, ARRAY_SIZE(nl80211_ops));
+	err = genl_register_family_with_ops(&nl80211_fam, nl80211_ops);
 	if (err)
 		return err;
 
-- 
cgit v1.3


From 2ecf7536b2787580616d23b6507005d930975ca0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Nov 2013 15:19:33 +0100
Subject: quota/genetlink: use proper genetlink multicast APIs

The quota code is abusing the genetlink API and is using
its family ID as the multicast group ID, which is invalid
and may belong to somebody else (and likely will.)

Make the quota code use the correct API, but since this
is already used as-is by userspace, reserve a family ID
for this code and also reserve that group ID to not break
userspace assumptions.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/quota/netlink.c             | 17 +++++++++++++++--
 include/uapi/linux/genetlink.h |  1 +
 net/netlink/genetlink.c        | 10 ++++++++--
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 16e8abb7709b..aa22fe03b76c 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -11,13 +11,23 @@
 
 /* Netlink family structure for quota */
 static struct genl_family quota_genl_family = {
-	.id = GENL_ID_GENERATE,
+	/*
+	 * Needed due to multicast group ID abuse - old code assumed
+	 * the family ID was also a valid multicast group ID (which
+	 * isn't true) and userspace might thus rely on it. Assign a
+	 * static ID for this group to make dealing with that easier.
+	 */
+	.id = GENL_ID_VFS_DQUOT,
 	.hdrsize = 0,
 	.name = "VFS_DQUOT",
 	.version = 1,
 	.maxattr = QUOTA_NL_A_MAX,
 };
 
+static struct genl_multicast_group quota_mcgrp = {
+	.name = "events",
+};
+
 /**
  * quota_send_warning - Send warning to userspace about exceeded quota
  * @type: The quota type: USRQQUOTA, GRPQUOTA,...
@@ -78,7 +88,7 @@ void quota_send_warning(struct kqid qid, dev_t dev,
 		goto attr_err_out;
 	genlmsg_end(skb, msg_head);
 
-	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+	genlmsg_multicast(skb, 0, quota_mcgrp.id, GFP_NOFS);
 	return;
 attr_err_out:
 	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
@@ -92,6 +102,9 @@ static int __init quota_init(void)
 	if (genl_register_family(&quota_genl_family) != 0)
 		printk(KERN_ERR
 		       "VFS: Failed to create quota netlink interface.\n");
+	if (genl_register_mc_group(&quota_genl_family, &quota_mcgrp))
+		printk(KERN_ERR
+		       "VFS: Failed to register quota mcast group.\n");
 	return 0;
 };
 
diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index c880a417d8a9..1af72d8228e0 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -27,6 +27,7 @@ struct genlmsghdr {
  */
 #define GENL_ID_GENERATE	0
 #define GENL_ID_CTRL		NLMSG_MIN_TYPE
+#define GENL_ID_VFS_DQUOT	(NLMSG_MIN_TYPE + 1)
 
 /**************************************************************************
  * Controller
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 353909d46dda..bee91a7527a5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -69,8 +69,11 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE];
  * abuses the API and thinks it can statically use group 1.
  * That group will typically conflict with other groups that
  * any proper users use.
+ * Bit 17 is marked as already used since the VFS quota code
+ * also abused this API and relied on family == group ID, we
+ * cater to that by giving it a static family and group ID.
  */
-static unsigned long mc_group_start = 0x3;
+static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_VFS_DQUOT);
 static unsigned long *mc_groups = &mc_group_start;
 static unsigned long mc_groups_longs = 1;
 
@@ -130,7 +133,8 @@ static u16 genl_generate_id(void)
 	int i;
 
 	for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
-		if (!genl_family_find_byid(id_gen_idx))
+		if (id_gen_idx != GENL_ID_VFS_DQUOT &&
+		    !genl_family_find_byid(id_gen_idx))
 			return id_gen_idx;
 		if (++id_gen_idx > GENL_MAX_ID)
 			id_gen_idx = GENL_MIN_ID;
@@ -169,6 +173,8 @@ int genl_register_mc_group(struct genl_family *family,
 		id = GENL_ID_CTRL;
 	else if (strcmp(family->name, "NET_DM") == 0)
 		id = 1;
+	else if (strcmp(family->name, "VFS_DQUOT") == 0)
+		id = GENL_ID_VFS_DQUOT;
 	else
 		id = find_first_zero_bit(mc_groups,
 					 mc_groups_longs * BITS_PER_LONG);
-- 
cgit v1.3


From 68eb55031da7c967d954e5f9415cd05f4abdb692 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Nov 2013 15:19:38 +0100
Subject: genetlink: pass family to functions using groups

This doesn't really change anything, but prepares for the
next patch that will change the APIs to pass the group ID
within the family, rather than the global group ID.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/event.c            |  3 +-
 drivers/net/team/team.c         |  5 +--
 drivers/scsi/pmcraid.c          |  3 +-
 drivers/thermal/thermal_core.c  |  3 +-
 fs/quota/netlink.c              |  2 +-
 include/linux/genl_magic_func.h |  3 +-
 include/net/genetlink.h         | 22 +++++++++----
 net/core/drop_monitor.c         |  3 +-
 net/hsr/hsr_netlink.c           |  6 ++--
 net/ieee802154/netlink.c        |  2 +-
 net/netlink/genetlink.c         | 12 ++++---
 net/nfc/netlink.c               | 39 +++++++++++++++--------
 net/openvswitch/datapath.c      | 35 +++++++++++++--------
 net/openvswitch/datapath.h      |  1 +
 net/openvswitch/dp_notify.c     |  5 +--
 net/wimax/op-msg.c              |  3 +-
 net/wimax/stack.c               |  3 +-
 net/wireless/nl80211.c          | 70 ++++++++++++++++++++---------------------
 18 files changed, 133 insertions(+), 87 deletions(-)

(limited to 'fs')

diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c
index 8247fcdde079..68a8755202ec 100644
--- a/drivers/acpi/event.c
+++ b/drivers/acpi/event.c
@@ -146,7 +146,8 @@ int acpi_bus_generate_netlink_event(const char *device_class,
 		return result;
 	}
 
-	genlmsg_multicast(skb, 0, acpi_event_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&acpi_event_genl_family,
+			  skb, 0, acpi_event_mcgrp.id, GFP_ATOMIC);
 	return 0;
 }
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index f55758b0840e..2721e29935a6 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2677,8 +2677,9 @@ static struct genl_multicast_group team_change_event_mcgrp = {
 static int team_nl_send_multicast(struct sk_buff *skb,
 				  struct team *team, u32 portid)
 {
-	return genlmsg_multicast_netns(dev_net(team->dev), skb, 0,
-				       team_change_event_mcgrp.id, GFP_KERNEL);
+	return genlmsg_multicast_netns(&team_nl_family, dev_net(team->dev),
+				       skb, 0, team_change_event_mcgrp.id,
+				       GFP_KERNEL);
 }
 
 static int team_nl_send_event_options_get(struct team *team,
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 1eb7b0280a45..2775441111ff 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -1512,7 +1512,8 @@ static int pmcraid_notify_aen(
 	}
 
 	result =
-		genlmsg_multicast(skb, 0, pmcraid_event_family.id, GFP_ATOMIC);
+		genlmsg_multicast(&pmcraid_event_family, skb, 0,
+				  pmcraid_event_family.id, GFP_ATOMIC);
 
 	/* If there are no listeners, genlmsg_multicast may return non-zero
 	 * value.
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 4962a6aaf295..2570a944fffc 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1675,7 +1675,8 @@ int thermal_generate_netlink_event(struct thermal_zone_device *tz,
 		return result;
 	}
 
-	result = genlmsg_multicast(skb, 0, thermal_event_mcgrp.id, GFP_ATOMIC);
+	result = genlmsg_multicast(&thermal_event_genl_family, skb, 0,
+				   thermal_event_mcgrp.id, GFP_ATOMIC);
 	if (result)
 		dev_err(&tz->device, "Failed to send netlink event:%d", result);
 
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index aa22fe03b76c..a5b5eddf6603 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -88,7 +88,7 @@ void quota_send_warning(struct kqid qid, dev_t dev,
 		goto attr_err_out;
 	genlmsg_end(skb, msg_head);
 
-	genlmsg_multicast(skb, 0, quota_mcgrp.id, GFP_NOFS);
+	genlmsg_multicast(&quota_genl_family, skb, 0, quota_mcgrp.id, GFP_NOFS);
 	return;
 attr_err_out:
 	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 47086030ab31..5b9b8ae6748b 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -286,7 +286,8 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 		CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id;	\
 	if (!group_id)							\
 		return -EINVAL;						\
-	return genlmsg_multicast(skb, 0, group_id, flags);		\
+	return genlmsg_multicast(&ZZZ_genl_family, skb, 0,		\
+				 group_id, flags);			\
 }
 
 #include GENL_MAGIC_INCLUDE_FILE
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 11ac77f6180c..60aef0df386b 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -166,7 +166,8 @@ static inline int _genl_register_family_with_ops(struct genl_family *family,
 int genl_unregister_family(struct genl_family *family);
 int genl_register_mc_group(struct genl_family *family,
 			   struct genl_multicast_group *grp);
-void genl_notify(struct sk_buff *skb, struct net *net, u32 portid,
+void genl_notify(struct genl_family *family,
+		 struct sk_buff *skb, struct net *net, u32 portid,
 		 u32 group, struct nlmsghdr *nlh, gfp_t flags);
 
 void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
@@ -246,13 +247,15 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
 
 /**
  * genlmsg_multicast_netns - multicast a netlink message to a specific netns
+ * @family: the generic netlink family
  * @net: the net namespace
  * @skb: netlink message as socket buffer
  * @portid: own netlink portid to avoid sending to yourself
  * @group: multicast group id
  * @flags: allocation flags
  */
-static inline int genlmsg_multicast_netns(struct net *net, struct sk_buff *skb,
+static inline int genlmsg_multicast_netns(struct genl_family *family,
+					  struct net *net, struct sk_buff *skb,
 					  u32 portid, unsigned int group, gfp_t flags)
 {
 	return nlmsg_multicast(net->genl_sock, skb, portid, group, flags);
@@ -260,19 +263,23 @@ static inline int genlmsg_multicast_netns(struct net *net, struct sk_buff *skb,
 
 /**
  * genlmsg_multicast - multicast a netlink message to the default netns
+ * @family: the generic netlink family
  * @skb: netlink message as socket buffer
  * @portid: own netlink portid to avoid sending to yourself
  * @group: multicast group id
  * @flags: allocation flags
  */
-static inline int genlmsg_multicast(struct sk_buff *skb, u32 portid,
+static inline int genlmsg_multicast(struct genl_family *family,
+				    struct sk_buff *skb, u32 portid,
 				    unsigned int group, gfp_t flags)
 {
-	return genlmsg_multicast_netns(&init_net, skb, portid, group, flags);
+	return genlmsg_multicast_netns(family, &init_net, skb,
+				       portid, group, flags);
 }
 
 /**
  * genlmsg_multicast_allns - multicast a netlink message to all net namespaces
+ * @family: the generic netlink family
  * @skb: netlink message as socket buffer
  * @portid: own netlink portid to avoid sending to yourself
  * @group: multicast group id
@@ -280,7 +287,8 @@ static inline int genlmsg_multicast(struct sk_buff *skb, u32 portid,
  *
  * This function must hold the RTNL or rcu_read_lock().
  */
-int genlmsg_multicast_allns(struct sk_buff *skb, u32 portid,
+int genlmsg_multicast_allns(struct genl_family *family,
+			    struct sk_buff *skb, u32 portid,
 			    unsigned int group, gfp_t flags);
 
 /**
@@ -353,6 +361,7 @@ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
 
 /**
  * genl_set_err - report error to genetlink broadcast listeners
+ * @family: the generic netlink family
  * @net: the network namespace to report the error to
  * @portid: the PORTID of a process that we want to skip (if any)
  * @group: the broadcast group that will notice the error
@@ -361,7 +370,8 @@ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
  * This function returns the number of broadcast listeners that have set the
  * NETLINK_RECV_NO_ENOBUFS socket option.
  */
-static inline int genl_set_err(struct net *net, u32 portid, u32 group, int code)
+static inline int genl_set_err(struct genl_family *family, struct net *net,
+			       u32 portid, u32 group, int code)
 {
 	return netlink_set_err(net->genl_sock, portid, group, code);
 }
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 46ee488c0015..1eab1dc48821 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -120,7 +120,8 @@ static void send_dm_alert(struct work_struct *work)
 	skb = reset_per_cpu_data(data);
 
 	if (skb)
-		genlmsg_multicast(skb, 0, dm_mcgrp.id, GFP_KERNEL);
+		genlmsg_multicast(&net_drop_monitor_family, skb, 0,
+				  dm_mcgrp.id, GFP_KERNEL);
 }
 
 /*
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 908e335dcdc9..0009416c08c2 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -129,7 +129,8 @@ void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN],
 		goto nla_put_failure;
 
 	genlmsg_end(skb, msg_head);
-	genlmsg_multicast(skb, 0, hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&hsr_genl_family, skb, 0,
+			  hsr_network_genl_mcgrp.id, GFP_ATOMIC);
 
 	return;
 
@@ -163,7 +164,8 @@ void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN])
 		goto nla_put_failure;
 
 	genlmsg_end(skb, msg_head);
-	genlmsg_multicast(skb, 0, hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&hsr_genl_family, skb, 0,
+			  hsr_network_genl_mcgrp.id, GFP_ATOMIC);
 
 	return;
 
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 1a81709a4717..5172f467a383 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -70,7 +70,7 @@ int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group)
 	if (genlmsg_end(msg, hdr) < 0)
 		goto out;
 
-	return genlmsg_multicast(msg, 0, group, GFP_ATOMIC);
+	return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC);
 out:
 	nlmsg_free(msg);
 	return -ENOBUFS;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index e9ef640200b4..36e3a86cacf6 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -883,11 +883,12 @@ static int genl_ctrl_event(int event, struct genl_family *family,
 		return PTR_ERR(msg);
 
 	if (!family->netnsok) {
-		genlmsg_multicast_netns(&init_net, msg, 0,
+		genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0,
 					GENL_ID_CTRL, GFP_KERNEL);
 	} else {
 		rcu_read_lock();
-		genlmsg_multicast_allns(msg, 0, GENL_ID_CTRL, GFP_ATOMIC);
+		genlmsg_multicast_allns(&genl_ctrl, msg, 0,
+					GENL_ID_CTRL, GFP_ATOMIC);
 		rcu_read_unlock();
 	}
 
@@ -993,14 +994,15 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
 	return err;
 }
 
-int genlmsg_multicast_allns(struct sk_buff *skb, u32 portid, unsigned int group,
-			    gfp_t flags)
+int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
+			    u32 portid, unsigned int group, gfp_t flags)
 {
 	return genlmsg_mcast(skb, portid, group, flags);
 }
 EXPORT_SYMBOL(genlmsg_multicast_allns);
 
-void genl_notify(struct sk_buff *skb, struct net *net, u32 portid, u32 group,
+void genl_notify(struct genl_family *family,
+		 struct sk_buff *skb, struct net *net, u32 portid, u32 group,
 		 struct nlmsghdr *nlh, gfp_t flags)
 {
 	struct sock *sk = net->genl_sock;
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index fe6760d328a0..3092df313fb1 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -194,7 +194,8 @@ int nfc_genl_targets_found(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	return genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	return genlmsg_multicast(&nfc_genl_family, msg, 0,
+				 nfc_genl_event_mcgrp.id, GFP_ATOMIC);
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -223,7 +224,8 @@ int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -255,7 +257,8 @@ int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -285,7 +288,8 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -318,7 +322,8 @@ int nfc_genl_device_added(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -348,7 +353,8 @@ int nfc_genl_device_removed(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -414,7 +420,8 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list)
 
 	genlmsg_end(msg, hdr);
 
-	return genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	return genlmsg_multicast(&nfc_genl_family, msg, 0,
+				 nfc_genl_event_mcgrp.id, GFP_ATOMIC);
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -448,7 +455,8 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -479,7 +487,8 @@ int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -600,7 +609,8 @@ int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
 
 	dev->dep_link_up = true;
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_ATOMIC);
 
 	return 0;
 
@@ -632,7 +642,8 @@ int nfc_genl_dep_link_down_event(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_ATOMIC);
 
 	return 0;
 
@@ -1137,7 +1148,8 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	return 0;
 
@@ -1308,7 +1320,8 @@ static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0,
+			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
 
 	kfree(ctx);
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 3e2bb15fd717..5c19846b1d2a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -61,10 +61,11 @@
 
 int ovs_net_id __read_mostly;
 
-static void ovs_notify(struct sk_buff *skb, struct genl_info *info,
+static void ovs_notify(struct genl_family *family,
+		       struct sk_buff *skb, struct genl_info *info,
 		       struct genl_multicast_group *grp)
 {
-	genl_notify(skb, genl_info_net(info), info->snd_portid,
+	genl_notify(family, skb, genl_info_net(info), info->snd_portid,
 		    grp->id, info->nlhdr, GFP_KERNEL);
 }
 
@@ -877,9 +878,10 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 	ovs_unlock();
 
 	if (!IS_ERR(reply))
-		ovs_notify(reply, info, &ovs_dp_flow_multicast_group);
+		ovs_notify(&dp_flow_genl_family, reply, info,
+			   &ovs_dp_flow_multicast_group);
 	else
-		genl_set_err(sock_net(skb->sk), 0,
+		genl_set_err(&dp_flow_genl_family, sock_net(skb->sk), 0,
 			     ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
 	return 0;
 
@@ -990,7 +992,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	ovs_flow_free(flow, true);
 	ovs_unlock();
 
-	ovs_notify(reply, info, &ovs_dp_flow_multicast_group);
+	ovs_notify(&dp_flow_genl_family, reply, info,
+		   &ovs_dp_flow_multicast_group);
 	return 0;
 unlock:
 	ovs_unlock();
@@ -1237,7 +1240,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 
 	ovs_unlock();
 
-	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
+	ovs_notify(&dp_datapath_genl_family, reply, info,
+		   &ovs_dp_datapath_multicast_group);
 	return 0;
 
 err_destroy_local_port:
@@ -1302,7 +1306,8 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	__dp_destroy(dp);
 	ovs_unlock();
 
-	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
+	ovs_notify(&dp_datapath_genl_family, reply, info,
+		   &ovs_dp_datapath_multicast_group);
 
 	return 0;
 unlock:
@@ -1326,14 +1331,15 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 				      info->snd_seq, OVS_DP_CMD_NEW);
 	if (IS_ERR(reply)) {
 		err = PTR_ERR(reply);
-		genl_set_err(sock_net(skb->sk), 0,
+		genl_set_err(&dp_datapath_genl_family, sock_net(skb->sk), 0,
 			     ovs_dp_datapath_multicast_group.id, err);
 		err = 0;
 		goto unlock;
 	}
 
 	ovs_unlock();
-	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
+	ovs_notify(&dp_datapath_genl_family, reply, info,
+		   &ovs_dp_datapath_multicast_group);
 
 	return 0;
 unlock:
@@ -1425,7 +1431,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
 	[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
 };
 
-static struct genl_family dp_vport_genl_family = {
+struct genl_family dp_vport_genl_family = {
 	.id = GENL_ID_GENERATE,
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_VPORT_FAMILY,
@@ -1595,7 +1601,8 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 		goto exit_unlock;
 	}
 
-	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
+	ovs_notify(&dp_vport_genl_family, reply, info,
+		   &ovs_dp_vport_multicast_group);
 
 exit_unlock:
 	ovs_unlock();
@@ -1642,7 +1649,8 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	BUG_ON(err < 0);
 
 	ovs_unlock();
-	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
+	ovs_notify(&dp_vport_genl_family, reply, info,
+		   &ovs_dp_vport_multicast_group);
 	return 0;
 
 exit_free:
@@ -1679,7 +1687,8 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	err = 0;
 	ovs_dp_detach_port(vport);
 
-	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
+	ovs_notify(&dp_vport_genl_family, reply, info,
+		   &ovs_dp_vport_multicast_group);
 
 exit_unlock:
 	ovs_unlock();
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index d3d14a58aa91..4067ea41be28 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -177,6 +177,7 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n
 }
 
 extern struct notifier_block ovs_dp_device_notifier;
+extern struct genl_family dp_vport_genl_family;
 extern struct genl_multicast_group ovs_dp_vport_multicast_group;
 
 void ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 3d55ead6095c..f4b66c84ea0b 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -34,13 +34,14 @@ static void dp_detach_port_notify(struct vport *vport)
 					  OVS_VPORT_CMD_DEL);
 	ovs_dp_detach_port(vport);
 	if (IS_ERR(notify)) {
-		genl_set_err(ovs_dp_get_net(dp), 0,
+		genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0,
 			     ovs_dp_vport_multicast_group.id,
 			     PTR_ERR(notify));
 		return;
 	}
 
-	genlmsg_multicast_netns(ovs_dp_get_net(dp), notify, 0,
+	genlmsg_multicast_netns(&dp_vport_genl_family,
+				ovs_dp_get_net(dp), notify, 0,
 				ovs_dp_vport_multicast_group.id,
 				GFP_KERNEL);
 }
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
index ff19cbeaf607..f37dd3c5576d 100644
--- a/net/wimax/op-msg.c
+++ b/net/wimax/op-msg.c
@@ -279,7 +279,8 @@ int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb)
 
 	d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size);
 	d_dump(2, dev, msg, size);
-	genlmsg_multicast(skb, 0, wimax_gnl_mcg.id, GFP_KERNEL);
+	genlmsg_multicast(&wimax_gnl_family, skb, 0,
+			  wimax_gnl_mcg.id, GFP_KERNEL);
 	d_printf(1, dev, "CTX: genl multicast done\n");
 	return 0;
 }
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 6328afe90319..18888748e699 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -177,7 +177,8 @@ int wimax_gnl_re_state_change_send(
 		goto out;
 	}
 	genlmsg_end(report_skb, header);
-	genlmsg_multicast(report_skb, 0, wimax_gnl_mcg.id, GFP_KERNEL);
+	genlmsg_multicast(&wimax_gnl_family, report_skb, 0,
+			  wimax_gnl_mcg.id, GFP_KERNEL);
 out:
 	d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n",
 		wimax_dev, report_skb, result);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1b6c5dd4dccf..f20edfd2e1f0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6868,7 +6868,7 @@ void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
 
 	nla_nest_end(skb, data);
 	genlmsg_end(skb, hdr);
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
 				nl80211_testmode_mcgrp.id, gfp);
 }
 EXPORT_SYMBOL(cfg80211_testmode_event);
@@ -9597,7 +9597,7 @@ void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev)
 		return;
 	}
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_config_mcgrp.id, GFP_KERNEL);
 }
 
@@ -9707,7 +9707,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
 		return;
 	}
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_scan_mcgrp.id, GFP_KERNEL);
 }
 
@@ -9726,7 +9726,7 @@ void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
 		return;
 	}
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_scan_mcgrp.id, GFP_KERNEL);
 }
 
@@ -9745,7 +9745,7 @@ void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
 		return;
 	}
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_scan_mcgrp.id, GFP_KERNEL);
 }
 
@@ -9764,7 +9764,7 @@ void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
 		return;
 	}
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_scan_mcgrp.id, GFP_KERNEL);
 }
 
@@ -9782,7 +9782,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
 		return;
 	}
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_scan_mcgrp.id, GFP_KERNEL);
 }
 
@@ -9837,8 +9837,8 @@ void nl80211_send_reg_change_event(struct regulatory_request *request)
 	genlmsg_end(msg, hdr);
 
 	rcu_read_lock();
-	genlmsg_multicast_allns(msg, 0, nl80211_regulatory_mcgrp.id,
-				GFP_ATOMIC);
+	genlmsg_multicast_allns(&nl80211_fam, msg, 0,
+				nl80211_regulatory_mcgrp.id, GFP_ATOMIC);
 	rcu_read_unlock();
 
 	return;
@@ -9873,7 +9873,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -9961,7 +9961,7 @@ static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10017,7 +10017,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10056,7 +10056,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10094,7 +10094,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, GFP_KERNEL);
 	return;
 
@@ -10128,7 +10128,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10169,7 +10169,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10208,7 +10208,7 @@ void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10261,8 +10261,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
 	genlmsg_end(msg, hdr);
 
 	rcu_read_lock();
-	genlmsg_multicast_allns(msg, 0, nl80211_regulatory_mcgrp.id,
-				GFP_ATOMIC);
+	genlmsg_multicast_allns(&nl80211_fam, msg, 0,
+				nl80211_regulatory_mcgrp.id, GFP_ATOMIC);
 	rcu_read_unlock();
 
 	return;
@@ -10307,7 +10307,7 @@ static void nl80211_send_remain_on_chan_event(
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10362,7 +10362,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
 		return;
 	}
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 }
 EXPORT_SYMBOL(cfg80211_new_sta);
@@ -10392,7 +10392,7 @@ void cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10428,7 +10428,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10590,7 +10590,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10639,7 +10639,7 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10684,7 +10684,7 @@ static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10742,7 +10742,7 @@ nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10789,7 +10789,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10866,7 +10866,7 @@ void cfg80211_cqm_txe_notify(struct net_device *dev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10915,7 +10915,7 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -10962,7 +10962,7 @@ void cfg80211_cqm_pktloss_notify(struct net_device *dev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -11002,7 +11002,7 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -11154,7 +11154,7 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -11196,7 +11196,7 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, gfp);
 	return;
 
@@ -11279,7 +11279,7 @@ void cfg80211_ft_event(struct net_device *netdev,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				nl80211_mlme_mcgrp.id, GFP_KERNEL);
 }
 EXPORT_SYMBOL(cfg80211_ft_event);
-- 
cgit v1.3


From 2a94fe48f32ccf7321450a2cc07f2b724a444e5b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 19 Nov 2013 15:19:39 +0100
Subject: genetlink: make multicast groups const, prevent abuse

Register generic netlink multicast groups as an array with
the family and give them contiguous group IDs. Then instead
of passing the global group ID to the various functions that
send messages, pass the ID relative to the family - for most
families that's just 0 because the only have one group.

This avoids the list_head and ID in each group, adding a new
field for the mcast group ID offset to the family.

At the same time, this allows us to prevent abusing groups
again like the quota and dropmon code did, since we can now
check that a family only uses a group it owns.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/event.c            |  26 ++--
 drivers/net/team/team.c         |  25 +---
 drivers/thermal/thermal_core.c  |  24 ++--
 fs/quota/netlink.c              |  15 +--
 include/linux/genl_magic_func.h |  49 +++----
 include/net/genetlink.h         |  48 ++++---
 net/core/drop_monitor.c         |  18 +--
 net/hsr/hsr_netlink.c           |  19 +--
 net/ieee802154/ieee802154.h     |   6 +-
 net/ieee802154/netlink.c        |  26 ++--
 net/ieee802154/nl-mac.c         |  22 +---
 net/netlink/genetlink.c         | 278 ++++++++++++++++++++++++----------------
 net/nfc/netlink.c               |  51 +++-----
 net/openvswitch/datapath.c      |  43 +++----
 net/openvswitch/dp_notify.c     |   6 +-
 net/wimax/op-msg.c              |   3 +-
 net/wimax/stack.c               |  21 ++-
 net/wimax/wimax-internal.h      |   1 -
 net/wireless/nl80211.c          | 129 ++++++++-----------
 19 files changed, 377 insertions(+), 433 deletions(-)

(limited to 'fs')

diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c
index 68a8755202ec..aeb5aa6ce068 100644
--- a/drivers/acpi/event.c
+++ b/drivers/acpi/event.c
@@ -78,15 +78,17 @@ enum {
 #define ACPI_GENL_VERSION		0x01
 #define ACPI_GENL_MCAST_GROUP_NAME 	"acpi_mc_group"
 
+static const struct genl_multicast_group acpi_event_mcgrps[] = {
+	{ .name = ACPI_GENL_MCAST_GROUP_NAME, },
+};
+
 static struct genl_family acpi_event_genl_family = {
 	.id = GENL_ID_GENERATE,
 	.name = ACPI_GENL_FAMILY_NAME,
 	.version = ACPI_GENL_VERSION,
 	.maxattr = ACPI_GENL_ATTR_MAX,
-};
-
-static struct genl_multicast_group acpi_event_mcgrp = {
-	.name = ACPI_GENL_MCAST_GROUP_NAME,
+	.mcgrps = acpi_event_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(acpi_event_mcgrps),
 };
 
 int acpi_bus_generate_netlink_event(const char *device_class,
@@ -146,8 +148,7 @@ int acpi_bus_generate_netlink_event(const char *device_class,
 		return result;
 	}
 
-	genlmsg_multicast(&acpi_event_genl_family,
-			  skb, 0, acpi_event_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&acpi_event_genl_family, skb, 0, 0, GFP_ATOMIC);
 	return 0;
 }
 
@@ -155,18 +156,7 @@ EXPORT_SYMBOL(acpi_bus_generate_netlink_event);
 
 static int acpi_event_genetlink_init(void)
 {
-	int result;
-
-	result = genl_register_family(&acpi_event_genl_family);
-	if (result)
-		return result;
-
-	result = genl_register_mc_group(&acpi_event_genl_family,
-					&acpi_event_mcgrp);
-	if (result)
-		genl_unregister_family(&acpi_event_genl_family);
-
-	return result;
+	return genl_register_family(&acpi_event_genl_family);
 }
 
 #else
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 2721e29935a6..0715de50b3dc 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2670,16 +2670,15 @@ static const struct genl_ops team_nl_ops[] = {
 	},
 };
 
-static struct genl_multicast_group team_change_event_mcgrp = {
-	.name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME,
+static const struct genl_multicast_group team_nl_mcgrps[] = {
+	{ .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, },
 };
 
 static int team_nl_send_multicast(struct sk_buff *skb,
 				  struct team *team, u32 portid)
 {
 	return genlmsg_multicast_netns(&team_nl_family, dev_net(team->dev),
-				       skb, 0, team_change_event_mcgrp.id,
-				       GFP_KERNEL);
+				       skb, 0, 0, GFP_KERNEL);
 }
 
 static int team_nl_send_event_options_get(struct team *team,
@@ -2698,22 +2697,8 @@ static int team_nl_send_event_port_get(struct team *team,
 
 static int team_nl_init(void)
 {
-	int err;
-
-	err = genl_register_family_with_ops(&team_nl_family, team_nl_ops);
-	if (err)
-		return err;
-
-	err = genl_register_mc_group(&team_nl_family, &team_change_event_mcgrp);
-	if (err)
-		goto err_change_event_grp_reg;
-
-	return 0;
-
-err_change_event_grp_reg:
-	genl_unregister_family(&team_nl_family);
-
-	return err;
+	return genl_register_family_with_ops_groups(&team_nl_family, team_nl_ops,
+						    team_nl_mcgrps);
 }
 
 static void team_nl_fini(void)
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 2570a944fffc..19edd6124ca3 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1606,15 +1606,17 @@ exit:
 EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name);
 
 #ifdef CONFIG_NET
+static const struct genl_multicast_group thermal_event_mcgrps[] = {
+	{ .name = THERMAL_GENL_MCAST_GROUP_NAME, },
+};
+
 static struct genl_family thermal_event_genl_family = {
 	.id = GENL_ID_GENERATE,
 	.name = THERMAL_GENL_FAMILY_NAME,
 	.version = THERMAL_GENL_VERSION,
 	.maxattr = THERMAL_GENL_ATTR_MAX,
-};
-
-static struct genl_multicast_group thermal_event_mcgrp = {
-	.name = THERMAL_GENL_MCAST_GROUP_NAME,
+	.mcgrps = thermal_event_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(thermal_event_mcgrps),
 };
 
 int thermal_generate_netlink_event(struct thermal_zone_device *tz,
@@ -1676,7 +1678,7 @@ int thermal_generate_netlink_event(struct thermal_zone_device *tz,
 	}
 
 	result = genlmsg_multicast(&thermal_event_genl_family, skb, 0,
-				   thermal_event_mcgrp.id, GFP_ATOMIC);
+				   0, GFP_ATOMIC);
 	if (result)
 		dev_err(&tz->device, "Failed to send netlink event:%d", result);
 
@@ -1686,17 +1688,7 @@ EXPORT_SYMBOL_GPL(thermal_generate_netlink_event);
 
 static int genetlink_init(void)
 {
-	int result;
-
-	result = genl_register_family(&thermal_event_genl_family);
-	if (result)
-		return result;
-
-	result = genl_register_mc_group(&thermal_event_genl_family,
-					&thermal_event_mcgrp);
-	if (result)
-		genl_unregister_family(&thermal_event_genl_family);
-	return result;
+	return genl_register_family(&thermal_event_genl_family);
 }
 
 static void genetlink_exit(void)
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index a5b5eddf6603..72d29177998e 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -9,6 +9,10 @@
 #include <net/netlink.h>
 #include <net/genetlink.h>
 
+static const struct genl_multicast_group quota_mcgrps[] = {
+	{ .name = "events", },
+};
+
 /* Netlink family structure for quota */
 static struct genl_family quota_genl_family = {
 	/*
@@ -22,10 +26,8 @@ static struct genl_family quota_genl_family = {
 	.name = "VFS_DQUOT",
 	.version = 1,
 	.maxattr = QUOTA_NL_A_MAX,
-};
-
-static struct genl_multicast_group quota_mcgrp = {
-	.name = "events",
+	.mcgrps = quota_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(quota_mcgrps),
 };
 
 /**
@@ -88,7 +90,7 @@ void quota_send_warning(struct kqid qid, dev_t dev,
 		goto attr_err_out;
 	genlmsg_end(skb, msg_head);
 
-	genlmsg_multicast(&quota_genl_family, skb, 0, quota_mcgrp.id, GFP_NOFS);
+	genlmsg_multicast(&quota_genl_family, skb, 0, 0, GFP_NOFS);
 	return;
 attr_err_out:
 	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
@@ -102,9 +104,6 @@ static int __init quota_init(void)
 	if (genl_register_family(&quota_genl_family) != 0)
 		printk(KERN_ERR
 		       "VFS: Failed to create quota netlink interface.\n");
-	if (genl_register_mc_group(&quota_genl_family, &quota_mcgrp))
-		printk(KERN_ERR
-		       "VFS: Failed to register quota mcast group.\n");
 	return 0;
 };
 
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 5b9b8ae6748b..c0894dd8827b 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -273,49 +273,40 @@ static struct genl_family ZZZ_genl_family __read_mostly = {
  * Magic: define multicast groups
  * Magic: define multicast group registration helper
  */
+#define ZZZ_genl_mcgrps		CONCAT_(GENL_MAGIC_FAMILY, _genl_mcgrps)
+static const struct genl_multicast_group ZZZ_genl_mcgrps[] = {
+#undef GENL_mc_group
+#define GENL_mc_group(group) { .name = #group, },
+#include GENL_MAGIC_INCLUDE_FILE
+};
+
+enum CONCAT_(GENL_MAGIC_FAMILY, group_ids) {
+#undef GENL_mc_group
+#define GENL_mc_group(group) CONCAT_(GENL_MAGIC_FAMILY, _group_ ## group),
+#include GENL_MAGIC_INCLUDE_FILE
+};
+
 #undef GENL_mc_group
 #define GENL_mc_group(group)						\
-static struct genl_multicast_group					\
-CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = {		\
-	.name = #group,							\
-};									\
 static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 	struct sk_buff *skb, gfp_t flags)				\
 {									\
 	unsigned int group_id =						\
-		CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id;	\
-	if (!group_id)							\
-		return -EINVAL;						\
+		CONCAT_(GENL_MAGIC_FAMILY, _group_ ## group);		\
 	return genlmsg_multicast(&ZZZ_genl_family, skb, 0,		\
 				 group_id, flags);			\
 }
 
 #include GENL_MAGIC_INCLUDE_FILE
 
-int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
-{
-	int err = genl_register_family_with_ops(&ZZZ_genl_family, ZZZ_genl_ops);
-	if (err)
-		return err;
-#undef GENL_mc_group
-#define GENL_mc_group(group)						\
-	err = genl_register_mc_group(&ZZZ_genl_family,			\
-		&CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group));		\
-	if (err)							\
-		goto fail;						\
-	else								\
-		pr_info("%s: mcg %s: %u\n", #group,			\
-			__stringify(GENL_MAGIC_FAMILY),			\
-			CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id);
-
-#include GENL_MAGIC_INCLUDE_FILE
-
 #undef GENL_mc_group
 #define GENL_mc_group(group)
-	return 0;
-fail:
-	genl_unregister_family(&ZZZ_genl_family);
-	return err;
+
+int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
+{
+	return genl_register_family_with_ops_groups(&ZZZ_genl_family,	\
+						    ZZZ_genl_ops,	\
+						    ZZZ_genl_mcgrps);
 }
 
 void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void)
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 60aef0df386b..ace4abf118d7 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -10,14 +10,9 @@
 /**
  * struct genl_multicast_group - generic netlink multicast group
  * @name: name of the multicast group, names are per-family
- * @id: multicast group ID, assigned by the core, to use with
- *      genlmsg_multicast().
- * @list: list entry for linking
  */
 struct genl_multicast_group {
-	struct list_head	list;		/* private */
 	char			name[GENL_NAMSIZ];
-	u32			id;
 };
 
 struct genl_ops;
@@ -38,7 +33,9 @@ struct genl_info;
  *	undo operations done by pre_doit, for example release locks
  * @attrbuf: buffer to store parsed attributes
  * @family_list: family list
- * @mcast_groups: multicast groups list
+ * @mcgrps: multicast groups used by this family (private)
+ * @n_mcgrps: number of multicast groups (private)
+ * @mcgrp_offset: starting number of multicast group IDs in this family
  * @ops: the operations supported by this family (private)
  * @n_ops: number of operations supported by this family (private)
  */
@@ -58,9 +55,11 @@ struct genl_family {
 					     struct genl_info *info);
 	struct nlattr **	attrbuf;	/* private */
 	const struct genl_ops *	ops;		/* private */
+	const struct genl_multicast_group *mcgrps; /* private */
 	unsigned int		n_ops;		/* private */
+	unsigned int		n_mcgrps;	/* private */
+	unsigned int		mcgrp_offset;	/* private */
 	struct list_head	family_list;	/* private */
-	struct list_head	mcast_groups;	/* private */
 	struct module		*module;
 };
 
@@ -150,22 +149,30 @@ static inline int genl_register_family(struct genl_family *family)
  *
  * Return 0 on success or a negative error code.
  */
-static inline int _genl_register_family_with_ops(struct genl_family *family,
-						 const struct genl_ops *ops,
-						 size_t n_ops)
+static inline int
+_genl_register_family_with_ops_grps(struct genl_family *family,
+				    const struct genl_ops *ops, size_t n_ops,
+				    const struct genl_multicast_group *mcgrps,
+				    size_t n_mcgrps)
 {
 	family->module = THIS_MODULE;
 	family->ops = ops;
 	family->n_ops = n_ops;
+	family->mcgrps = mcgrps;
+	family->n_mcgrps = n_mcgrps;
 	return __genl_register_family(family);
 }
 
-#define genl_register_family_with_ops(family, ops)	\
-	_genl_register_family_with_ops((family), (ops), ARRAY_SIZE(ops))
+#define genl_register_family_with_ops(family, ops)			\
+	_genl_register_family_with_ops_grps((family),			\
+					    (ops), ARRAY_SIZE(ops),	\
+					    NULL, 0)
+#define genl_register_family_with_ops_groups(family, ops, grps)	\
+	_genl_register_family_with_ops_grps((family),			\
+					    (ops), ARRAY_SIZE(ops),	\
+					    (grps), ARRAY_SIZE(grps))
 
 int genl_unregister_family(struct genl_family *family);
-int genl_register_mc_group(struct genl_family *family,
-			   struct genl_multicast_group *grp);
 void genl_notify(struct genl_family *family,
 		 struct sk_buff *skb, struct net *net, u32 portid,
 		 u32 group, struct nlmsghdr *nlh, gfp_t flags);
@@ -251,13 +258,16 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
  * @net: the net namespace
  * @skb: netlink message as socket buffer
  * @portid: own netlink portid to avoid sending to yourself
- * @group: multicast group id
+ * @group: offset of multicast group in groups array
  * @flags: allocation flags
  */
 static inline int genlmsg_multicast_netns(struct genl_family *family,
 					  struct net *net, struct sk_buff *skb,
 					  u32 portid, unsigned int group, gfp_t flags)
 {
+	if (group >= family->n_mcgrps)
+		return -EINVAL;
+	group = family->mcgrp_offset + group;
 	return nlmsg_multicast(net->genl_sock, skb, portid, group, flags);
 }
 
@@ -266,13 +276,16 @@ static inline int genlmsg_multicast_netns(struct genl_family *family,
  * @family: the generic netlink family
  * @skb: netlink message as socket buffer
  * @portid: own netlink portid to avoid sending to yourself
- * @group: multicast group id
+ * @group: offset of multicast group in groups array
  * @flags: allocation flags
  */
 static inline int genlmsg_multicast(struct genl_family *family,
 				    struct sk_buff *skb, u32 portid,
 				    unsigned int group, gfp_t flags)
 {
+	if (group >= family->n_mcgrps)
+		return -EINVAL;
+	group = family->mcgrp_offset + group;
 	return genlmsg_multicast_netns(family, &init_net, skb,
 				       portid, group, flags);
 }
@@ -282,7 +295,7 @@ static inline int genlmsg_multicast(struct genl_family *family,
  * @family: the generic netlink family
  * @skb: netlink message as socket buffer
  * @portid: own netlink portid to avoid sending to yourself
- * @group: multicast group id
+ * @group: offset of multicast group in groups array
  * @flags: allocation flags
  *
  * This function must hold the RTNL or rcu_read_lock().
@@ -365,6 +378,7 @@ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
  * @net: the network namespace to report the error to
  * @portid: the PORTID of a process that we want to skip (if any)
  * @group: the broadcast group that will notice the error
+ * 	(this is the offset of the multicast group in the groups array)
  * @code: error code, must be negative (as usual in kernelspace)
  *
  * This function returns the number of broadcast listeners that have set the
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 1eab1dc48821..95897183226e 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -106,8 +106,8 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
 	return skb;
 }
 
-static struct genl_multicast_group dm_mcgrp = {
-	.name = "events",
+static struct genl_multicast_group dropmon_mcgrps[] = {
+	{ .name = "events", },
 };
 
 static void send_dm_alert(struct work_struct *work)
@@ -121,7 +121,7 @@ static void send_dm_alert(struct work_struct *work)
 
 	if (skb)
 		genlmsg_multicast(&net_drop_monitor_family, skb, 0,
-				  dm_mcgrp.id, GFP_KERNEL);
+				  0, GFP_KERNEL);
 }
 
 /*
@@ -369,19 +369,13 @@ static int __init init_net_drop_monitor(void)
 		return -ENOSPC;
 	}
 
-	rc = genl_register_family_with_ops(&net_drop_monitor_family,
-					   dropmon_ops);
+	rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
+						  dropmon_ops, dropmon_mcgrps);
 	if (rc) {
 		pr_err("Could not create drop monitor netlink family\n");
 		return rc;
 	}
-
-	rc = genl_register_mc_group(&net_drop_monitor_family, &dm_mcgrp);
-	if (rc) {
-		pr_err("Failed to register drop monitor mcast group\n");
-		goto out_unreg;
-	}
-	WARN_ON(dm_mcgrp.id != NET_DM_GRP_ALERT);
+	WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
 
 	rc = register_netdevice_notifier(&dropmon_net_notifier);
 	if (rc < 0) {
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 0009416c08c2..5325af85eea6 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -90,8 +90,8 @@ static struct genl_family hsr_genl_family = {
 	.maxattr = HSR_A_MAX,
 };
 
-static struct genl_multicast_group hsr_network_genl_mcgrp = {
-	.name = "hsr-network",
+static const struct genl_multicast_group hsr_mcgrps[] = {
+	{ .name = "hsr-network", },
 };
 
 
@@ -129,8 +129,7 @@ void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN],
 		goto nla_put_failure;
 
 	genlmsg_end(skb, msg_head);
-	genlmsg_multicast(&hsr_genl_family, skb, 0,
-			  hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&hsr_genl_family, skb, 0, 0, GFP_ATOMIC);
 
 	return;
 
@@ -164,8 +163,7 @@ void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN])
 		goto nla_put_failure;
 
 	genlmsg_end(skb, msg_head);
-	genlmsg_multicast(&hsr_genl_family, skb, 0,
-			  hsr_network_genl_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&hsr_genl_family, skb, 0, 0, GFP_ATOMIC);
 
 	return;
 
@@ -416,18 +414,13 @@ int __init hsr_netlink_init(void)
 	if (rc)
 		goto fail_rtnl_link_register;
 
-	rc = genl_register_family_with_ops(&hsr_genl_family, hsr_ops);
+	rc = genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops,
+						  hsr_mcgrps);
 	if (rc)
 		goto fail_genl_register_family;
 
-	rc = genl_register_mc_group(&hsr_genl_family, &hsr_network_genl_mcgrp);
-	if (rc)
-		goto fail_genl_register_mc_group;
-
 	return 0;
 
-fail_genl_register_mc_group:
-	genl_unregister_family(&hsr_genl_family);
 fail_genl_register_family:
 	rtnl_link_unregister(&hsr_link_ops);
 fail_rtnl_link_register:
diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h
index 14d5dab4436f..cee4425b9956 100644
--- a/net/ieee802154/ieee802154.h
+++ b/net/ieee802154/ieee802154.h
@@ -54,8 +54,10 @@ int ieee802154_dump_phy(struct sk_buff *skb, struct netlink_callback *cb);
 int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info);
 int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info);
 
-extern struct genl_multicast_group ieee802154_coord_mcgrp;
-extern struct genl_multicast_group ieee802154_beacon_mcgrp;
+enum ieee802154_mcgrp_ids {
+	IEEE802154_COORD_MCGRP,
+	IEEE802154_BEACON_MCGRP,
+};
 
 int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info);
 int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 5172f467a383..43f1b2bf469f 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -125,25 +125,17 @@ static const struct genl_ops ieee8021154_ops[] = {
 			ieee802154_dump_iface),
 };
 
-int __init ieee802154_nl_init(void)
-{
-	int rc;
-
-	rc = genl_register_family_with_ops(&nl802154_family, ieee8021154_ops);
-	if (rc)
-		return rc;
+static const struct genl_multicast_group ieee802154_mcgrps[] = {
+	[IEEE802154_COORD_MCGRP] = { .name = IEEE802154_MCAST_COORD_NAME, },
+	[IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
+};
 
-	rc = genl_register_mc_group(&nl802154_family, &ieee802154_coord_mcgrp);
-	if (rc)
-		goto fail;
 
-	rc = genl_register_mc_group(&nl802154_family, &ieee802154_beacon_mcgrp);
-	if (rc)
-		goto fail;
-	return 0;
-fail:
-	genl_unregister_family(&nl802154_family);
-	return rc;
+int __init ieee802154_nl_init(void)
+{
+	return genl_register_family_with_ops_groups(&nl802154_family,
+						    ieee8021154_ops,
+						    ieee802154_mcgrps);
 }
 
 void __exit ieee802154_nl_exit(void)
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index 28d493032132..ba5c1e002f37 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -39,14 +39,6 @@
 
 #include "ieee802154.h"
 
-struct genl_multicast_group ieee802154_coord_mcgrp = {
-	.name		= IEEE802154_MCAST_COORD_NAME,
-};
-
-struct genl_multicast_group ieee802154_beacon_mcgrp = {
-	.name		= IEEE802154_MCAST_BEACON_NAME,
-};
-
 int ieee802154_nl_assoc_indic(struct net_device *dev,
 		struct ieee802154_addr *addr, u8 cap)
 {
@@ -72,7 +64,7 @@ int ieee802154_nl_assoc_indic(struct net_device *dev,
 	    nla_put_u8(msg, IEEE802154_ATTR_CAPABILITY, cap))
 		goto nla_put_failure;
 
-	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);
+	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
 
 nla_put_failure:
 	nlmsg_free(msg);
@@ -98,7 +90,7 @@ int ieee802154_nl_assoc_confirm(struct net_device *dev, u16 short_addr,
 	    nla_put_u16(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) ||
 	    nla_put_u8(msg, IEEE802154_ATTR_STATUS, status))
 		goto nla_put_failure;
-	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);
+	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
 
 nla_put_failure:
 	nlmsg_free(msg);
@@ -133,7 +125,7 @@ int ieee802154_nl_disassoc_indic(struct net_device *dev,
 	}
 	if (nla_put_u8(msg, IEEE802154_ATTR_REASON, reason))
 		goto nla_put_failure;
-	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);
+	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
 
 nla_put_failure:
 	nlmsg_free(msg);
@@ -157,7 +149,7 @@ int ieee802154_nl_disassoc_confirm(struct net_device *dev, u8 status)
 		    dev->dev_addr) ||
 	    nla_put_u8(msg, IEEE802154_ATTR_STATUS, status))
 		goto nla_put_failure;
-	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);
+	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
 
 nla_put_failure:
 	nlmsg_free(msg);
@@ -183,7 +175,7 @@ int ieee802154_nl_beacon_indic(struct net_device *dev,
 	    nla_put_u16(msg, IEEE802154_ATTR_COORD_SHORT_ADDR, coord_addr) ||
 	    nla_put_u16(msg, IEEE802154_ATTR_COORD_PAN_ID, panid))
 		goto nla_put_failure;
-	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);
+	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
 
 nla_put_failure:
 	nlmsg_free(msg);
@@ -214,7 +206,7 @@ int ieee802154_nl_scan_confirm(struct net_device *dev,
 	    (edl &&
 	     nla_put(msg, IEEE802154_ATTR_ED_LIST, 27, edl)))
 		goto nla_put_failure;
-	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);
+	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
 
 nla_put_failure:
 	nlmsg_free(msg);
@@ -238,7 +230,7 @@ int ieee802154_nl_start_confirm(struct net_device *dev, u8 status)
 		    dev->dev_addr) ||
 	    nla_put_u8(msg, IEEE802154_ATTR_STATUS, status))
 		goto nla_put_failure;
-	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);
+	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
 
 nla_put_failure:
 	nlmsg_free(msg);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 36e3a86cacf6..7dbc4f732c75 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -69,16 +69,20 @@ static struct list_head family_ht[GENL_FAM_TAB_SIZE];
  * abuses the API and thinks it can statically use group 1.
  * That group will typically conflict with other groups that
  * any proper users use.
+ * Bit 16 is marked as used since it's used for generic netlink
+ * and the code no longer marks pre-reserved IDs as used.
  * Bit 17 is marked as already used since the VFS quota code
  * also abused this API and relied on family == group ID, we
  * cater to that by giving it a static family and group ID.
  */
-static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_VFS_DQUOT);
+static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
+				      BIT(GENL_ID_VFS_DQUOT);
 static unsigned long *mc_groups = &mc_group_start;
 static unsigned long mc_groups_longs = 1;
 
 static int genl_ctrl_event(int event, struct genl_family *family,
-			   struct genl_multicast_group *grp);
+			   const struct genl_multicast_group *grp,
+			   int grp_id);
 
 static inline unsigned int genl_family_hash(unsigned int id)
 {
@@ -144,66 +148,110 @@ static u16 genl_generate_id(void)
 	return 0;
 }
 
-static struct genl_multicast_group notify_grp;
-
-/**
- * genl_register_mc_group - register a multicast group
- *
- * Registers the specified multicast group and notifies userspace
- * about the new group.
- *
- * Returns 0 on success or a negative error code.
- *
- * @family: The generic netlink family the group shall be registered for.
- * @grp: The group to register, must have a name.
- */
-int genl_register_mc_group(struct genl_family *family,
-			   struct genl_multicast_group *grp)
+static int genl_allocate_reserve_groups(int n_groups, int *first_id)
 {
-	int id;
 	unsigned long *new_groups;
-	int err = 0;
+	int start = 0;
+	int i;
+	int id;
+	bool fits;
+
+	do {
+		if (start == 0)
+			id = find_first_zero_bit(mc_groups,
+						 mc_groups_longs *
+						 BITS_PER_LONG);
+		else
+			id = find_next_zero_bit(mc_groups,
+						mc_groups_longs * BITS_PER_LONG,
+						start);
+
+		fits = true;
+		for (i = id;
+		     i < min_t(int, id + n_groups,
+			       mc_groups_longs * BITS_PER_LONG);
+		     i++) {
+			if (test_bit(i, mc_groups)) {
+				start = i;
+				fits = false;
+				break;
+			}
+		}
 
-	BUG_ON(grp->name[0] == '\0');
-	BUG_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL);
+		if (id >= mc_groups_longs * BITS_PER_LONG) {
+			unsigned long new_longs = mc_groups_longs +
+						  BITS_TO_LONGS(n_groups);
+			size_t nlen = new_longs * sizeof(unsigned long);
+
+			if (mc_groups == &mc_group_start) {
+				new_groups = kzalloc(nlen, GFP_KERNEL);
+				if (!new_groups)
+					return -ENOMEM;
+				mc_groups = new_groups;
+				*mc_groups = mc_group_start;
+			} else {
+				new_groups = krealloc(mc_groups, nlen,
+						      GFP_KERNEL);
+				if (!new_groups)
+					return -ENOMEM;
+				mc_groups = new_groups;
+				for (i = 0; i < BITS_TO_LONGS(n_groups); i++)
+					mc_groups[mc_groups_longs + i] = 0;
+			}
+			mc_groups_longs = new_longs;
+		}
+	} while (!fits);
 
-	genl_lock_all();
+	for (i = id; i < id + n_groups; i++)
+		set_bit(i, mc_groups);
+	*first_id = id;
+	return 0;
+}
+
+static struct genl_family genl_ctrl;
+
+static int genl_validate_assign_mc_groups(struct genl_family *family)
+{
+	int first_id;
+	int n_groups = family->n_mcgrps;
+	int err, i;
+	bool groups_allocated = false;
+
+	if (!n_groups)
+		return 0;
+
+	for (i = 0; i < n_groups; i++) {
+		const struct genl_multicast_group *grp = &family->mcgrps[i];
+
+		if (WARN_ON(grp->name[0] == '\0'))
+			return -EINVAL;
+		if (WARN_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL))
+			return -EINVAL;
+	}
 
 	/* special-case our own group and hacks */
-	if (grp == &notify_grp)
-		id = GENL_ID_CTRL;
-	else if (strcmp(family->name, "NET_DM") == 0)
-		id = 1;
-	else if (strcmp(family->name, "VFS_DQUOT") == 0)
-		id = GENL_ID_VFS_DQUOT;
-	else
-		id = find_first_zero_bit(mc_groups,
-					 mc_groups_longs * BITS_PER_LONG);
-
-
-	if (id >= mc_groups_longs * BITS_PER_LONG) {
-		size_t nlen = (mc_groups_longs + 1) * sizeof(unsigned long);
-
-		if (mc_groups == &mc_group_start) {
-			new_groups = kzalloc(nlen, GFP_KERNEL);
-			if (!new_groups) {
-				err = -ENOMEM;
-				goto out;
-			}
-			mc_groups = new_groups;
-			*mc_groups = mc_group_start;
-		} else {
-			new_groups = krealloc(mc_groups, nlen, GFP_KERNEL);
-			if (!new_groups) {
-				err = -ENOMEM;
-				goto out;
-			}
-			mc_groups = new_groups;
-			mc_groups[mc_groups_longs] = 0;
-		}
-		mc_groups_longs++;
+	if (family == &genl_ctrl) {
+		first_id = GENL_ID_CTRL;
+		BUG_ON(n_groups != 1);
+	} else if (strcmp(family->name, "NET_DM") == 0) {
+		first_id = 1;
+		BUG_ON(n_groups != 1);
+	} else if (strcmp(family->name, "VFS_DQUOT") == 0) {
+		first_id = GENL_ID_VFS_DQUOT;
+		BUG_ON(n_groups != 1);
+	} else {
+		groups_allocated = true;
+		err = genl_allocate_reserve_groups(n_groups, &first_id);
+		if (err)
+			return err;
 	}
 
+	family->mcgrp_offset = first_id;
+
+	/* if still initializing, can't and don't need to to realloc bitmaps */
+	if (!init_net.genl_sock)
+		return 0;
+
 	if (family->netnsok) {
 		struct net *net;
 
@@ -219,9 +267,7 @@ int genl_register_mc_group(struct genl_family *family,
 				 * number of _possible_ groups has been
 				 * increased on some sockets which is ok.
 				 */
-				rcu_read_unlock();
-				netlink_table_ungrab();
-				goto out;
+				break;
 			}
 		}
 		rcu_read_unlock();
@@ -229,46 +275,39 @@ int genl_register_mc_group(struct genl_family *family,
 	} else {
 		err = netlink_change_ngroups(init_net.genl_sock,
 					     mc_groups_longs * BITS_PER_LONG);
-		if (err)
-			goto out;
 	}
 
-	grp->id = id;
-	set_bit(id, mc_groups);
-	list_add_tail(&grp->list, &family->mcast_groups);
+	if (groups_allocated && err) {
+		for (i = 0; i < family->n_mcgrps; i++)
+			clear_bit(family->mcgrp_offset + i, mc_groups);
+	}
 
-	genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family, grp);
- out:
-	genl_unlock_all();
 	return err;
 }
-EXPORT_SYMBOL(genl_register_mc_group);
 
-static void __genl_unregister_mc_group(struct genl_family *family,
-				       struct genl_multicast_group *grp)
+static void genl_unregister_mc_groups(struct genl_family *family)
 {
 	struct net *net;
+	int i;
 
 	netlink_table_grab();
 	rcu_read_lock();
-	for_each_net_rcu(net)
-		__netlink_clear_multicast_users(net->genl_sock, grp->id);
+	for_each_net_rcu(net) {
+		for (i = 0; i < family->n_mcgrps; i++)
+			__netlink_clear_multicast_users(
+				net->genl_sock, family->mcgrp_offset + i);
+	}
 	rcu_read_unlock();
 	netlink_table_ungrab();
 
-	if (grp->id != 1)
-		clear_bit(grp->id, mc_groups);
-	list_del(&grp->list);
-	genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family, grp);
-	grp->id = 0;
-}
-
-static void genl_unregister_mc_groups(struct genl_family *family)
-{
-	struct genl_multicast_group *grp, *tmp;
+	for (i = 0; i < family->n_mcgrps; i++) {
+		int grp_id = family->mcgrp_offset + i;
 
-	list_for_each_entry_safe(grp, tmp, &family->mcast_groups, list)
-		__genl_unregister_mc_group(family, grp);
+		if (grp_id != 1)
+			clear_bit(grp_id, mc_groups);
+		genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family,
+				&family->mcgrps[i], grp_id);
+	}
 }
 
 static int genl_validate_ops(struct genl_family *family)
@@ -314,7 +353,7 @@ static int genl_validate_ops(struct genl_family *family)
  */
 int __genl_register_family(struct genl_family *family)
 {
-	int err = -EINVAL;
+	int err = -EINVAL, i;
 
 	if (family->id && family->id < GENL_MIN_ID)
 		goto errout;
@@ -326,8 +365,6 @@ int __genl_register_family(struct genl_family *family)
 	if (err)
 		return err;
 
-	INIT_LIST_HEAD(&family->mcast_groups);
-
 	genl_lock_all();
 
 	if (genl_family_find_byname(family->name)) {
@@ -359,10 +396,18 @@ int __genl_register_family(struct genl_family *family)
 	} else
 		family->attrbuf = NULL;
 
+	err = genl_validate_assign_mc_groups(family);
+	if (err)
+		goto errout_locked;
+
 	list_add_tail(&family->family_list, genl_family_chain(family->id));
 	genl_unlock_all();
 
-	genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL);
+	/* send all events */
+	genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0);
+	for (i = 0; i < family->n_mcgrps; i++)
+		genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family,
+				&family->mcgrps[i], family->mcgrp_offset + i);
 
 	return 0;
 
@@ -398,7 +443,7 @@ int genl_unregister_family(struct genl_family *family)
 		genl_unlock_all();
 
 		kfree(family->attrbuf);
-		genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL);
+		genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);
 		return 0;
 	}
 
@@ -658,23 +703,26 @@ static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq,
 		nla_nest_end(skb, nla_ops);
 	}
 
-	if (!list_empty(&family->mcast_groups)) {
-		struct genl_multicast_group *grp;
+	if (family->n_mcgrps) {
 		struct nlattr *nla_grps;
-		int idx = 1;
+		int i;
 
 		nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS);
 		if (nla_grps == NULL)
 			goto nla_put_failure;
 
-		list_for_each_entry(grp, &family->mcast_groups, list) {
+		for (i = 0; i < family->n_mcgrps; i++) {
 			struct nlattr *nest;
+			const struct genl_multicast_group *grp;
 
-			nest = nla_nest_start(skb, idx++);
+			grp = &family->mcgrps[i];
+
+			nest = nla_nest_start(skb, i + 1);
 			if (nest == NULL)
 				goto nla_put_failure;
 
-			if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id) ||
+			if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID,
+					family->mcgrp_offset + i) ||
 			    nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME,
 					   grp->name))
 				goto nla_put_failure;
@@ -692,9 +740,9 @@ nla_put_failure:
 }
 
 static int ctrl_fill_mcgrp_info(struct genl_family *family,
-				struct genl_multicast_group *grp, u32 portid,
-				u32 seq, u32 flags, struct sk_buff *skb,
-				u8 cmd)
+				const struct genl_multicast_group *grp,
+				int grp_id, u32 portid, u32 seq, u32 flags,
+				struct sk_buff *skb, u8 cmd)
 {
 	void *hdr;
 	struct nlattr *nla_grps;
@@ -716,7 +764,7 @@ static int ctrl_fill_mcgrp_info(struct genl_family *family,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id) ||
+	if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) ||
 	    nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME,
 			   grp->name))
 		goto nla_put_failure;
@@ -782,9 +830,10 @@ static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
 	return skb;
 }
 
-static struct sk_buff *ctrl_build_mcgrp_msg(struct genl_family *family,
-					    struct genl_multicast_group *grp,
-					    u32 portid, int seq, u8 cmd)
+static struct sk_buff *
+ctrl_build_mcgrp_msg(struct genl_family *family,
+		     const struct genl_multicast_group *grp,
+		     int grp_id, u32 portid, int seq, u8 cmd)
 {
 	struct sk_buff *skb;
 	int err;
@@ -793,7 +842,8 @@ static struct sk_buff *ctrl_build_mcgrp_msg(struct genl_family *family,
 	if (skb == NULL)
 		return ERR_PTR(-ENOBUFS);
 
-	err = ctrl_fill_mcgrp_info(family, grp, portid, seq, 0, skb, cmd);
+	err = ctrl_fill_mcgrp_info(family, grp, grp_id, portid,
+				   seq, 0, skb, cmd);
 	if (err < 0) {
 		nlmsg_free(skb);
 		return ERR_PTR(err);
@@ -856,7 +906,8 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
 }
 
 static int genl_ctrl_event(int event, struct genl_family *family,
-			   struct genl_multicast_group *grp)
+			   const struct genl_multicast_group *grp,
+			   int grp_id)
 {
 	struct sk_buff *msg;
 
@@ -873,7 +924,7 @@ static int genl_ctrl_event(int event, struct genl_family *family,
 	case CTRL_CMD_NEWMCAST_GRP:
 	case CTRL_CMD_DELMCAST_GRP:
 		BUG_ON(!grp);
-		msg = ctrl_build_mcgrp_msg(family, grp, 0, 0, event);
+		msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event);
 		break;
 	default:
 		return -EINVAL;
@@ -884,11 +935,11 @@ static int genl_ctrl_event(int event, struct genl_family *family,
 
 	if (!family->netnsok) {
 		genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0,
-					GENL_ID_CTRL, GFP_KERNEL);
+					0, GFP_KERNEL);
 	} else {
 		rcu_read_lock();
 		genlmsg_multicast_allns(&genl_ctrl, msg, 0,
-					GENL_ID_CTRL, GFP_ATOMIC);
+					0, GFP_ATOMIC);
 		rcu_read_unlock();
 	}
 
@@ -904,8 +955,8 @@ static struct genl_ops genl_ctrl_ops[] = {
 	},
 };
 
-static struct genl_multicast_group notify_grp = {
-	.name		= "notify",
+static struct genl_multicast_group genl_ctrl_groups[] = {
+	{ .name = "notify", },
 };
 
 static int __net_init genl_pernet_init(struct net *net)
@@ -945,7 +996,8 @@ static int __init genl_init(void)
 	for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
 		INIT_LIST_HEAD(&family_ht[i]);
 
-	err = genl_register_family_with_ops(&genl_ctrl, genl_ctrl_ops);
+	err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops,
+						   genl_ctrl_groups);
 	if (err < 0)
 		goto problem;
 
@@ -953,10 +1005,6 @@ static int __init genl_init(void)
 	if (err)
 		goto problem;
 
-	err = genl_register_mc_group(&genl_ctrl, &notify_grp);
-	if (err < 0)
-		goto problem;
-
 	return 0;
 
 problem:
@@ -997,6 +1045,9 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
 int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
 			    u32 portid, unsigned int group, gfp_t flags)
 {
+	if (group >= family->n_mcgrps)
+		return -EINVAL;
+	group = family->mcgrp_offset + group;
 	return genlmsg_mcast(skb, portid, group, flags);
 }
 EXPORT_SYMBOL(genlmsg_multicast_allns);
@@ -1011,6 +1062,9 @@ void genl_notify(struct genl_family *family,
 	if (nlh)
 		report = nlmsg_report(nlh);
 
+	if (group >= family->n_mcgrps)
+		return;
+	group = family->mcgrp_offset + group;
 	nlmsg_notify(sk, skb, portid, group, report, flags);
 }
 EXPORT_SYMBOL(genl_notify);
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 3092df313fb1..a9b2342d5253 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -30,8 +30,8 @@
 #include "nfc.h"
 #include "llcp.h"
 
-static struct genl_multicast_group nfc_genl_event_mcgrp = {
-	.name = NFC_GENL_MCAST_EVENT_NAME,
+static const struct genl_multicast_group nfc_genl_mcgrps[] = {
+	{ .name = NFC_GENL_MCAST_EVENT_NAME, },
 };
 
 static struct genl_family nfc_genl_family = {
@@ -194,8 +194,7 @@ int nfc_genl_targets_found(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	return genlmsg_multicast(&nfc_genl_family, msg, 0,
-				 nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -224,8 +223,7 @@ int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -257,8 +255,7 @@ int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -288,8 +285,7 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -322,8 +318,7 @@ int nfc_genl_device_added(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -353,8 +348,7 @@ int nfc_genl_device_removed(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -420,8 +414,7 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list)
 
 	genlmsg_end(msg, hdr);
 
-	return genlmsg_multicast(&nfc_genl_family, msg, 0,
-				 nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
@@ -455,8 +448,7 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -487,8 +479,7 @@ int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -609,8 +600,7 @@ int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
 
 	dev->dep_link_up = true;
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
 
 	return 0;
 
@@ -642,8 +632,7 @@ int nfc_genl_dep_link_down_event(struct nfc_dev *dev)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_ATOMIC);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
 
 	return 0;
 
@@ -1148,8 +1137,7 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	return 0;
 
@@ -1320,8 +1308,7 @@ static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&nfc_genl_family, msg, 0,
-			  nfc_genl_event_mcgrp.id, GFP_KERNEL);
+	genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
 
 	kfree(ctx);
 
@@ -1549,15 +1536,15 @@ int __init nfc_genl_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&nfc_genl_family, nfc_genl_ops);
+	rc = genl_register_family_with_ops_groups(&nfc_genl_family,
+						  nfc_genl_ops,
+						  nfc_genl_mcgrps);
 	if (rc)
 		return rc;
 
-	rc = genl_register_mc_group(&nfc_genl_family, &nfc_genl_event_mcgrp);
-
 	netlink_register_notifier(&nl_notifier);
 
-	return rc;
+	return 0;
 }
 
 /**
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 5c19846b1d2a..1de4d281e3f1 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -62,11 +62,10 @@
 int ovs_net_id __read_mostly;
 
 static void ovs_notify(struct genl_family *family,
-		       struct sk_buff *skb, struct genl_info *info,
-		       struct genl_multicast_group *grp)
+		       struct sk_buff *skb, struct genl_info *info)
 {
 	genl_notify(family, skb, genl_info_net(info), info->snd_portid,
-		    grp->id, info->nlhdr, GFP_KERNEL);
+		    0, info->nlhdr, GFP_KERNEL);
 }
 
 /**
@@ -878,11 +877,10 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 	ovs_unlock();
 
 	if (!IS_ERR(reply))
-		ovs_notify(&dp_flow_genl_family, reply, info,
-			   &ovs_dp_flow_multicast_group);
+		ovs_notify(&dp_flow_genl_family, reply, info);
 	else
 		genl_set_err(&dp_flow_genl_family, sock_net(skb->sk), 0,
-			     ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
+			     0, PTR_ERR(reply));
 	return 0;
 
 err_flow_free:
@@ -992,8 +990,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	ovs_flow_free(flow, true);
 	ovs_unlock();
 
-	ovs_notify(&dp_flow_genl_family, reply, info,
-		   &ovs_dp_flow_multicast_group);
+	ovs_notify(&dp_flow_genl_family, reply, info);
 	return 0;
 unlock:
 	ovs_unlock();
@@ -1240,8 +1237,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 
 	ovs_unlock();
 
-	ovs_notify(&dp_datapath_genl_family, reply, info,
-		   &ovs_dp_datapath_multicast_group);
+	ovs_notify(&dp_datapath_genl_family, reply, info);
 	return 0;
 
 err_destroy_local_port:
@@ -1306,8 +1302,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	__dp_destroy(dp);
 	ovs_unlock();
 
-	ovs_notify(&dp_datapath_genl_family, reply, info,
-		   &ovs_dp_datapath_multicast_group);
+	ovs_notify(&dp_datapath_genl_family, reply, info);
 
 	return 0;
 unlock:
@@ -1332,14 +1327,13 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	if (IS_ERR(reply)) {
 		err = PTR_ERR(reply);
 		genl_set_err(&dp_datapath_genl_family, sock_net(skb->sk), 0,
-			     ovs_dp_datapath_multicast_group.id, err);
+			     0, err);
 		err = 0;
 		goto unlock;
 	}
 
 	ovs_unlock();
-	ovs_notify(&dp_datapath_genl_family, reply, info,
-		   &ovs_dp_datapath_multicast_group);
+	ovs_notify(&dp_datapath_genl_family, reply, info);
 
 	return 0;
 unlock:
@@ -1601,8 +1595,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 		goto exit_unlock;
 	}
 
-	ovs_notify(&dp_vport_genl_family, reply, info,
-		   &ovs_dp_vport_multicast_group);
+	ovs_notify(&dp_vport_genl_family, reply, info);
 
 exit_unlock:
 	ovs_unlock();
@@ -1649,8 +1642,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	BUG_ON(err < 0);
 
 	ovs_unlock();
-	ovs_notify(&dp_vport_genl_family, reply, info,
-		   &ovs_dp_vport_multicast_group);
+	ovs_notify(&dp_vport_genl_family, reply, info);
 	return 0;
 
 exit_free:
@@ -1687,8 +1679,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	err = 0;
 	ovs_dp_detach_port(vport);
 
-	ovs_notify(&dp_vport_genl_family, reply, info,
-		   &ovs_dp_vport_multicast_group);
+	ovs_notify(&dp_vport_genl_family, reply, info);
 
 exit_unlock:
 	ovs_unlock();
@@ -1790,7 +1781,7 @@ struct genl_family_and_ops {
 	struct genl_family *family;
 	const struct genl_ops *ops;
 	int n_ops;
-	struct genl_multicast_group *group;
+	const struct genl_multicast_group *group;
 };
 
 static const struct genl_family_and_ops dp_genl_families[] = {
@@ -1828,16 +1819,12 @@ static int dp_register_genl(void)
 
 		f->family->ops = f->ops;
 		f->family->n_ops = f->n_ops;
+		f->family->mcgrps = f->group;
+		f->family->n_mcgrps = f->group ? 1 : 0;
 		err = genl_register_family(f->family);
 		if (err)
 			goto error;
 		n_registered++;
-
-		if (f->group) {
-			err = genl_register_mc_group(f->family, f->group);
-			if (err)
-				goto error;
-		}
 	}
 
 	return 0;
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index f4b66c84ea0b..2c631fe76be1 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -35,15 +35,13 @@ static void dp_detach_port_notify(struct vport *vport)
 	ovs_dp_detach_port(vport);
 	if (IS_ERR(notify)) {
 		genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0,
-			     ovs_dp_vport_multicast_group.id,
-			     PTR_ERR(notify));
+			     0, PTR_ERR(notify));
 		return;
 	}
 
 	genlmsg_multicast_netns(&dp_vport_genl_family,
 				ovs_dp_get_net(dp), notify, 0,
-				ovs_dp_vport_multicast_group.id,
-				GFP_KERNEL);
+				0, GFP_KERNEL);
 }
 
 void ovs_dp_notify_wq(struct work_struct *work)
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
index f37dd3c5576d..c278b3356f75 100644
--- a/net/wimax/op-msg.c
+++ b/net/wimax/op-msg.c
@@ -279,8 +279,7 @@ int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb)
 
 	d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size);
 	d_dump(2, dev, msg, size);
-	genlmsg_multicast(&wimax_gnl_family, skb, 0,
-			  wimax_gnl_mcg.id, GFP_KERNEL);
+	genlmsg_multicast(&wimax_gnl_family, skb, 0, 0, GFP_KERNEL);
 	d_printf(1, dev, "CTX: genl multicast done\n");
 	return 0;
 }
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 18888748e699..ef2191b969a7 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -116,8 +116,9 @@ struct sk_buff *wimax_gnl_re_state_change_alloc(
 		dev_err(dev, "RE_STCH: can't create message\n");
 		goto error_new;
 	}
-	data = genlmsg_put(report_skb, 0, wimax_gnl_mcg.id, &wimax_gnl_family,
-			   0, WIMAX_GNL_RE_STATE_CHANGE);
+	/* FIXME: sending a group ID as the seq is wrong */
+	data = genlmsg_put(report_skb, 0, wimax_gnl_family.mcgrp_offset,
+			   &wimax_gnl_family, 0, WIMAX_GNL_RE_STATE_CHANGE);
 	if (data == NULL) {
 		dev_err(dev, "RE_STCH: can't put data into message\n");
 		goto error_put;
@@ -177,8 +178,7 @@ int wimax_gnl_re_state_change_send(
 		goto out;
 	}
 	genlmsg_end(report_skb, header);
-	genlmsg_multicast(&wimax_gnl_family, report_skb, 0,
-			  wimax_gnl_mcg.id, GFP_KERNEL);
+	genlmsg_multicast(&wimax_gnl_family, report_skb, 0, 0, GFP_KERNEL);
 out:
 	d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n",
 		wimax_dev, report_skb, result);
@@ -580,8 +580,8 @@ struct genl_family wimax_gnl_family = {
 	.maxattr = WIMAX_GNL_ATTR_MAX,
 };
 
-struct genl_multicast_group wimax_gnl_mcg = {
-	.name = "msg",
+static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
+	{ .name = "msg", },
 };
 
 
@@ -598,21 +598,18 @@ int __init wimax_subsys_init(void)
 
 	snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
 		 "WiMAX");
-	result = genl_register_family_with_ops(&wimax_gnl_family,
-					       wimax_gnl_ops);
+	result = genl_register_family_with_ops_groups(&wimax_gnl_family,
+						      wimax_gnl_ops,
+						      wimax_gnl_mcgrps);
 	if (unlikely(result < 0)) {
 		printk(KERN_ERR "cannot register generic netlink family: %d\n",
 		       result);
 		goto error_register_family;
 	}
 
-	result = genl_register_mc_group(&wimax_gnl_family, &wimax_gnl_mcg);
-	if (result < 0)
-		goto error_mc_group;
 	d_fnend(4, NULL, "() = 0\n");
 	return 0;
 
-error_mc_group:
 	genl_unregister_family(&wimax_gnl_family);
 error_register_family:
 	d_fnend(4, NULL, "() = %d\n", result);
diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h
index 8567d3079a83..b445b82020a8 100644
--- a/net/wimax/wimax-internal.h
+++ b/net/wimax/wimax-internal.h
@@ -86,7 +86,6 @@ void wimax_rfkill_rm(struct wimax_dev *);
 
 /* generic netlink */
 extern struct genl_family wimax_gnl_family;
-extern struct genl_multicast_group wimax_gnl_mcg;
 
 /* ops */
 int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f20edfd2e1f0..a1eb21073176 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -47,6 +47,25 @@ static struct genl_family nl80211_fam = {
 	.post_doit = nl80211_post_doit,
 };
 
+/* multicast groups */
+enum nl80211_multicast_groups {
+	NL80211_MCGRP_CONFIG,
+	NL80211_MCGRP_SCAN,
+	NL80211_MCGRP_REGULATORY,
+	NL80211_MCGRP_MLME,
+	NL80211_MCGRP_TESTMODE /* keep last - ifdef! */
+};
+
+static const struct genl_multicast_group nl80211_mcgrps[] = {
+	[NL80211_MCGRP_CONFIG] = { .name = "config", },
+	[NL80211_MCGRP_SCAN] = { .name = "scan", },
+	[NL80211_MCGRP_REGULATORY] = { .name = "regulatory", },
+	[NL80211_MCGRP_MLME] = { .name = "mlme", },
+#ifdef CONFIG_NL80211_TESTMODE
+	[NL80211_MCGRP_TESTMODE] = { .name = "testmode", }
+#endif
+};
+
 /* returns ERR_PTR values */
 static struct wireless_dev *
 __cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs)
@@ -6656,10 +6675,6 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info)
 
 
 #ifdef CONFIG_NL80211_TESTMODE
-static struct genl_multicast_group nl80211_testmode_mcgrp = {
-	.name = "testmode",
-};
-
 static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -6869,7 +6884,7 @@ void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
 	nla_nest_end(skb, data);
 	genlmsg_end(skb, hdr);
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
-				nl80211_testmode_mcgrp.id, gfp);
+				NL80211_MCGRP_TESTMODE, gfp);
 }
 EXPORT_SYMBOL(cfg80211_testmode_event);
 #endif
@@ -9566,21 +9581,6 @@ static const struct genl_ops nl80211_ops[] = {
 	},
 };
 
-static struct genl_multicast_group nl80211_mlme_mcgrp = {
-	.name = "mlme",
-};
-
-/* multicast groups */
-static struct genl_multicast_group nl80211_config_mcgrp = {
-	.name = "config",
-};
-static struct genl_multicast_group nl80211_scan_mcgrp = {
-	.name = "scan",
-};
-static struct genl_multicast_group nl80211_regulatory_mcgrp = {
-	.name = "regulatory",
-};
-
 /* notification functions */
 
 void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev)
@@ -9598,7 +9598,7 @@ void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev)
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_config_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_CONFIG, GFP_KERNEL);
 }
 
 static int nl80211_add_scan_req(struct sk_buff *msg,
@@ -9708,7 +9708,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_scan_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_SCAN, GFP_KERNEL);
 }
 
 void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
@@ -9727,7 +9727,7 @@ void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_scan_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_SCAN, GFP_KERNEL);
 }
 
 void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
@@ -9746,7 +9746,7 @@ void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_scan_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_SCAN, GFP_KERNEL);
 }
 
 void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
@@ -9765,7 +9765,7 @@ void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_scan_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_SCAN, GFP_KERNEL);
 }
 
 void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
@@ -9783,7 +9783,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_scan_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_SCAN, GFP_KERNEL);
 }
 
 /*
@@ -9838,7 +9838,7 @@ void nl80211_send_reg_change_event(struct regulatory_request *request)
 
 	rcu_read_lock();
 	genlmsg_multicast_allns(&nl80211_fam, msg, 0,
-				nl80211_regulatory_mcgrp.id, GFP_ATOMIC);
+				NL80211_MCGRP_REGULATORY, GFP_ATOMIC);
 	rcu_read_unlock();
 
 	return;
@@ -9874,7 +9874,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -9962,7 +9962,7 @@ static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10018,7 +10018,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10057,7 +10057,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10095,7 +10095,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_MLME, GFP_KERNEL);
 	return;
 
  nla_put_failure:
@@ -10129,7 +10129,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10170,7 +10170,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10209,7 +10209,7 @@ void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10262,7 +10262,7 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
 
 	rcu_read_lock();
 	genlmsg_multicast_allns(&nl80211_fam, msg, 0,
-				nl80211_regulatory_mcgrp.id, GFP_ATOMIC);
+				NL80211_MCGRP_REGULATORY, GFP_ATOMIC);
 	rcu_read_unlock();
 
 	return;
@@ -10308,7 +10308,7 @@ static void nl80211_send_remain_on_chan_event(
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10363,7 +10363,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 }
 EXPORT_SYMBOL(cfg80211_new_sta);
 
@@ -10393,7 +10393,7 @@ void cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp)
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10429,7 +10429,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10591,7 +10591,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10640,7 +10640,7 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10685,7 +10685,7 @@ static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10743,7 +10743,7 @@ nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10790,7 +10790,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10867,7 +10867,7 @@ void cfg80211_cqm_txe_notify(struct net_device *dev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10916,7 +10916,7 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -10963,7 +10963,7 @@ void cfg80211_cqm_pktloss_notify(struct net_device *dev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -11003,7 +11003,7 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -11155,7 +11155,7 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  free_msg:
@@ -11197,7 +11197,7 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, gfp);
+				NL80211_MCGRP_MLME, gfp);
 	return;
 
  nla_put_failure:
@@ -11280,7 +11280,7 @@ void cfg80211_ft_event(struct net_device *netdev,
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
-				nl80211_mlme_mcgrp.id, GFP_KERNEL);
+				NL80211_MCGRP_MLME, GFP_KERNEL);
 }
 EXPORT_SYMBOL(cfg80211_ft_event);
 
@@ -11329,32 +11329,11 @@ int nl80211_init(void)
 {
 	int err;
 
-	err = genl_register_family_with_ops(&nl80211_fam, nl80211_ops);
+	err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops,
+						   nl80211_mcgrps);
 	if (err)
 		return err;
 
-	err = genl_register_mc_group(&nl80211_fam, &nl80211_config_mcgrp);
-	if (err)
-		goto err_out;
-
-	err = genl_register_mc_group(&nl80211_fam, &nl80211_scan_mcgrp);
-	if (err)
-		goto err_out;
-
-	err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp);
-	if (err)
-		goto err_out;
-
-	err = genl_register_mc_group(&nl80211_fam, &nl80211_mlme_mcgrp);
-	if (err)
-		goto err_out;
-
-#ifdef CONFIG_NL80211_TESTMODE
-	err = genl_register_mc_group(&nl80211_fam, &nl80211_testmode_mcgrp);
-	if (err)
-		goto err_out;
-#endif
-
 	err = netlink_register_notifier(&nl80211_netlink_notifier);
 	if (err)
 		goto err_out;
-- 
cgit v1.3


From d558023207e008a4476a3b7bb8706b2a2bf5d84f Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Tue, 19 Nov 2013 17:33:02 -0500
Subject: aio: prevent double free in ioctx_alloc

ioctx_alloc() calls aio_setup_ring() to allocate a ring. If aio_setup_ring()
fails to do so it would call aio_free_ring() before returning, but
ioctx_alloc() would call aio_free_ring() again causing a double free of
the ring.

This is easily reproducible from userspace.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index ee77dc13d5b2..63135331cec2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -610,7 +610,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 err_cleanup:
 	aio_nr_sub(ctx->max_reqs);
 err:
-	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
 	free_percpu(ctx->users.pcpu_count);
-- 
cgit v1.3


From ddb8c45ba15149ebd41d7586261c05f7ca37f9a1 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Tue, 19 Nov 2013 17:33:03 -0500
Subject: aio: nullify aio->ring_pages after freeing it

After freeing ring_pages we leave it as is causing a dangling pointer. This
has already caused an issue so to help catching any issues in the future
NULL it out.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 63135331cec2..ad460d78d6c5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -196,8 +196,10 @@ static void aio_free_ring(struct kioctx *ctx)
 
 	put_aio_ring_file(ctx);
 
-	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
+		ctx->ring_pages = NULL;
+	}
 }
 
 static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-- 
cgit v1.3


From 365da4adebb1c012febf81019ad3dc5bb52e2a13 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 19 Nov 2013 17:32:43 -0500
Subject: nfsd4: fix xdr decoding of large non-write compounds

This fixes a regression from 247500820ebd02ad87525db5d9b199e5b66f6636
"nfsd4: fix decoding of compounds across page boundaries".  The previous
code was correct: argp->pagelist is initialized in
nfs4svc_deocde_compoundargs to rqstp->rq_arg.pages, and is therefore a
pointer to the page *after* the page we are currently decoding.

The reason that patch nevertheless fixed a problem with decoding
compounds containing write was a bug in the write decoding introduced by
5a80a54d21c96590d013378d8c5f65f879451ab4 "nfsd4: reorganize write
decoding", after which write decoding no longer adhered to the rule that
argp->pagelist point to the next page.

Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 088de1355e93..ee7237f99f54 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -141,8 +141,8 @@ xdr_error:					\
 
 static void next_decode_page(struct nfsd4_compoundargs *argp)
 {
-	argp->pagelist++;
 	argp->p = page_address(argp->pagelist[0]);
+	argp->pagelist++;
 	if (argp->pagelen < PAGE_SIZE) {
 		argp->end = argp->p + (argp->pagelen>>2);
 		argp->pagelen = 0;
@@ -1229,6 +1229,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 		len -= pages * PAGE_SIZE;
 
 		argp->p = (__be32 *)page_address(argp->pagelist[0]);
+		argp->pagelist++;
 		argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
 	}
 	argp->p += XDR_QUADLEN(len);
-- 
cgit v1.3


From 9508c6b90b3f57ecea4e7a896cf8325400fc0c6e Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Wed, 13 Nov 2013 02:56:26 +0000
Subject: Squashfs: Refactor decompressor interface and code

The decompressor interface and code was written from
the point of view of single-threaded operation.  In doing
so it mixed a lot of single-threaded implementation specific
aspects into the decompressor code and elsewhere which makes it
difficult to seamlessly support multiple different decompressor
implementations.

This patch does the following:

1.  It removes compressor_options parsing from the decompressor
    init() function.  This allows the decompressor init() function
    to be dynamically called to instantiate multiple decompressors,
    without the compressor options needing to be read and parsed each
    time.

2.  It moves threading and all sleeping operations out of the
    decompressors.  In doing so, it makes the decompressors
    non-blocking wrappers which only deal with interfacing with
    the decompressor implementation.

3. It splits decompressor.[ch] into decompressor generic functions
   in decompressor.[ch], and moves the single threaded
   decompressor implementation into decompressor_single.c.

The result of this patch is Squashfs should now be able to
support multiple decompressors by adding new decompressor_xxx.c
files with specialised implementations of the functions in
decompressor_single.c

Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reviewed-by: Minchan Kim <minchan@kernel.org>
---
 fs/squashfs/Makefile              |  2 +-
 fs/squashfs/block.c               | 11 +++--
 fs/squashfs/decompressor.c        | 47 ++++++++++++++-------
 fs/squashfs/decompressor.h        | 21 +++------
 fs/squashfs/decompressor_single.c | 86 +++++++++++++++++++++++++++++++++++++
 fs/squashfs/lzo_wrapper.c         | 24 +++--------
 fs/squashfs/squashfs.h            |  9 +++-
 fs/squashfs/squashfs_fs_sb.h      |  3 +-
 fs/squashfs/super.c               | 10 ++---
 fs/squashfs/xz_wrapper.c          | 89 +++++++++++++++++++++------------------
 fs/squashfs/zlib_wrapper.c        | 50 +++++++---------------
 11 files changed, 216 insertions(+), 136 deletions(-)
 create mode 100644 fs/squashfs/decompressor_single.c

(limited to 'fs')

diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 110b0476f3b4..c223c8439c21 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-y += namei.o super.o symlink.o decompressor.o decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
 squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 41d108ecc9be..4dd402597f22 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -93,7 +93,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	struct buffer_head **bh;
 	int offset = index & ((1 << msblk->devblksize_log2) - 1);
 	u64 cur_index = index >> msblk->devblksize_log2;
-	int bytes, compressed, b = 0, k = 0, page = 0, avail;
+	int bytes, compressed, b = 0, k = 0, page = 0, avail, i;
 
 	bh = kcalloc(((srclength + msblk->devblksize - 1)
 		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
@@ -158,6 +158,12 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 		ll_rw_block(READ, b - 1, bh + 1);
 	}
 
+	for (i = 0; i < b; i++) {
+		wait_on_buffer(bh[i]);
+		if (!buffer_uptodate(bh[i]))
+			goto block_release;
+	}
+
 	if (compressed) {
 		length = squashfs_decompress(msblk, buffer, bh, b, offset,
 			 length, srclength, pages);
@@ -172,9 +178,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 		for (bytes = length; k < b; k++) {
 			in = min(bytes, msblk->devblksize - offset);
 			bytes -= in;
-			wait_on_buffer(bh[k]);
-			if (!buffer_uptodate(bh[k]))
-				goto block_release;
 			while (in) {
 				if (pg_offset == PAGE_CACHE_SIZE) {
 					page++;
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 3f6271d86abc..234291f79ba5 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -37,29 +37,29 @@
  */
 
 static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
-	NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+	NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
 };
 
 #ifndef CONFIG_SQUASHFS_LZO
 static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
-	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+	NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
 };
 #endif
 
 #ifndef CONFIG_SQUASHFS_XZ
 static const struct squashfs_decompressor squashfs_xz_comp_ops = {
-	NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+	NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
 };
 #endif
 
 #ifndef CONFIG_SQUASHFS_ZLIB
 static const struct squashfs_decompressor squashfs_zlib_comp_ops = {
-	NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
+	NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
 };
 #endif
 
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
-	NULL, NULL, NULL, 0, "unknown", 0
+	NULL, NULL, NULL, NULL, 0, "unknown", 0
 };
 
 static const struct squashfs_decompressor *decompressor[] = {
@@ -83,10 +83,10 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
 }
 
 
-void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
+static void *get_comp_opts(struct super_block *sb, unsigned short flags)
 {
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
-	void *strm, *buffer = NULL;
+	void *buffer = NULL, *comp_opts;
 	int length = 0;
 
 	/*
@@ -94,23 +94,40 @@ void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
 	 */
 	if (SQUASHFS_COMP_OPTS(flags)) {
 		buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
-		if (buffer == NULL)
-			return ERR_PTR(-ENOMEM);
+		if (buffer == NULL) {
+			comp_opts = ERR_PTR(-ENOMEM);
+			goto out;
+		}
 
 		length = squashfs_read_data(sb, &buffer,
 			sizeof(struct squashfs_super_block), 0, NULL,
-			PAGE_CACHE_SIZE, 1);
+				PAGE_CACHE_SIZE, 1);
 
 		if (length < 0) {
-			strm = ERR_PTR(length);
-			goto finished;
+			comp_opts = ERR_PTR(length);
+			goto out;
 		}
 	}
 
-	strm = msblk->decompressor->init(msblk, buffer, length);
+	comp_opts = squashfs_comp_opts(msblk, buffer, length);
 
-finished:
+out:
 	kfree(buffer);
+	return comp_opts;
+}
+
+
+void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	void *stream, *comp_opts = get_comp_opts(sb, flags);
+
+	if (IS_ERR(comp_opts))
+		return comp_opts;
+
+	stream = squashfs_decompressor_create(msblk, comp_opts);
+	if (IS_ERR(stream))
+		kfree(comp_opts);
 
-	return strm;
+	return stream;
 }
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 330073e29029..6cdb20a3878a 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,28 +24,21 @@
  */
 
 struct squashfs_decompressor {
-	void	*(*init)(struct squashfs_sb_info *, void *, int);
+	void	*(*init)(struct squashfs_sb_info *, void *);
+	void	*(*comp_opts)(struct squashfs_sb_info *, void *, int);
 	void	(*free)(void *);
-	int	(*decompress)(struct squashfs_sb_info *, void **,
+	int	(*decompress)(struct squashfs_sb_info *, void *, void **,
 		struct buffer_head **, int, int, int, int, int);
 	int	id;
 	char	*name;
 	int	supported;
 };
 
-static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
-	void *s)
+static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
+							void *buff, int length)
 {
-	if (msblk->decompressor)
-		msblk->decompressor->free(s);
-}
-
-static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
-	void **buffer, struct buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
-{
-	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
-		length, srclength, pages);
+	return msblk->decompressor->comp_opts ?
+		msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
 }
 
 #ifdef CONFIG_SQUASHFS_XZ
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
new file mode 100644
index 000000000000..f857cf6f22d4
--- /dev/null
+++ b/fs/squashfs/decompressor_single.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements single-threaded decompression in the
+ * decompressor framework
+ */
+
+struct squashfs_stream {
+	void		*stream;
+	struct mutex	mutex;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+						void *comp_opts)
+{
+	struct squashfs_stream *stream;
+	int err = -ENOMEM;
+
+	stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto out;
+
+	stream->stream = msblk->decompressor->init(msblk, comp_opts);
+	if (IS_ERR(stream->stream)) {
+		err = PTR_ERR(stream->stream);
+		goto out;
+	}
+
+	kfree(comp_opts);
+	mutex_init(&stream->mutex);
+	return stream;
+
+out:
+	kfree(stream);
+	return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+	struct squashfs_stream *stream = msblk->stream;
+
+	if (stream) {
+		msblk->decompressor->free(stream->stream);
+		kfree(stream);
+	}
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
+{
+	int res;
+	struct squashfs_stream *stream = msblk->stream;
+
+	mutex_lock(&stream->mutex);
+	res = msblk->decompressor->decompress(msblk, stream->stream, buffer,
+		bh, b, offset, length, srclength, pages);
+	mutex_unlock(&stream->mutex);
+
+	if (res < 0)
+		ERROR("%s decompression failed, data probably corrupt\n",
+			msblk->decompressor->name);
+
+	return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+	return 1;
+}
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 00f4dfc5f088..75c3b5779172 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -37,7 +37,7 @@ struct squashfs_lzo {
 	void	*output;
 };
 
-static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
+static void *lzo_init(struct squashfs_sb_info *msblk, void *buff)
 {
 	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
 
@@ -74,22 +74,16 @@ static void lzo_free(void *strm)
 }
 
 
-static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
-	struct buffer_head **bh, int b, int offset, int length, int srclength,
-	int pages)
+static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
 {
-	struct squashfs_lzo *stream = msblk->stream;
+	struct squashfs_lzo *stream = strm;
 	void *buff = stream->input;
 	int avail, i, bytes = length, res;
 	size_t out_len = srclength;
 
-	mutex_lock(&msblk->read_data_mutex);
-
 	for (i = 0; i < b; i++) {
-		wait_on_buffer(bh[i]);
-		if (!buffer_uptodate(bh[i]))
-			goto block_release;
-
 		avail = min(bytes, msblk->devblksize - offset);
 		memcpy(buff, bh[i]->b_data + offset, avail);
 		buff += avail;
@@ -111,17 +105,9 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		bytes -= avail;
 	}
 
-	mutex_unlock(&msblk->read_data_mutex);
 	return res;
 
-block_release:
-	for (; i < b; i++)
-		put_bh(bh[i]);
-
 failed:
-	mutex_unlock(&msblk->read_data_mutex);
-
-	ERROR("lzo decompression failed, data probably corrupt\n");
 	return -EIO;
 }
 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index d1266516ed08..2e2751df8452 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -48,7 +48,14 @@ extern void *squashfs_read_table(struct super_block *, u64, int);
 
 /* decompressor.c */
 extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
-extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
+extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
+
+/* decompressor_xxx.c */
+extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
+extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
+extern int squashfs_decompress(struct squashfs_sb_info *, void **,
+	struct buffer_head **, int, int, int, int, int);
+extern int squashfs_max_decompressors(void);
 
 /* export.c */
 extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 52934a22f296..9cdcf4150d59 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -63,10 +63,9 @@ struct squashfs_sb_info {
 	__le64					*id_table;
 	__le64					*fragment_index;
 	__le64					*xattr_id_table;
-	struct mutex				read_data_mutex;
 	struct mutex				meta_index_mutex;
 	struct meta_index			*meta_index;
-	void					*stream;
+	struct squashfs_stream			*stream;
 	__le64					*inode_lookup_table;
 	u64					inode_table;
 	u64					directory_table;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 60553a9053ca..202df6312d4e 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -98,7 +98,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
 	msblk->devblksize_log2 = ffz(~msblk->devblksize);
 
-	mutex_init(&msblk->read_data_mutex);
 	mutex_init(&msblk->meta_index_mutex);
 
 	/*
@@ -206,13 +205,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 
 	/* Allocate read_page block */
-	msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
+	msblk->read_page = squashfs_cache_init("data",
+		squashfs_max_decompressors(), msblk->block_size);
 	if (msblk->read_page == NULL) {
 		ERROR("Failed to allocate read_page block\n");
 		goto failed_mount;
 	}
 
-	msblk->stream = squashfs_decompressor_init(sb, flags);
+	msblk->stream = squashfs_decompressor_setup(sb, flags);
 	if (IS_ERR(msblk->stream)) {
 		err = PTR_ERR(msblk->stream);
 		msblk->stream = NULL;
@@ -336,7 +336,7 @@ failed_mount:
 	squashfs_cache_delete(msblk->block_cache);
 	squashfs_cache_delete(msblk->fragment_cache);
 	squashfs_cache_delete(msblk->read_page);
-	squashfs_decompressor_free(msblk, msblk->stream);
+	squashfs_decompressor_destroy(msblk);
 	kfree(msblk->inode_lookup_table);
 	kfree(msblk->fragment_index);
 	kfree(msblk->id_table);
@@ -383,7 +383,7 @@ static void squashfs_put_super(struct super_block *sb)
 		squashfs_cache_delete(sbi->block_cache);
 		squashfs_cache_delete(sbi->fragment_cache);
 		squashfs_cache_delete(sbi->read_page);
-		squashfs_decompressor_free(sbi, sbi->stream);
+		squashfs_decompressor_destroy(sbi);
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 1760b7d108f6..5d1d07cca6b4 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -38,38 +38,63 @@ struct squashfs_xz {
 	struct xz_buf buf;
 };
 
-struct comp_opts {
+struct disk_comp_opts {
 	__le32 dictionary_size;
 	__le32 flags;
 };
 
-static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
-	int len)
+struct comp_opts {
+	int dict_size;
+};
+
+static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk,
+	void *buff, int len)
 {
-	struct comp_opts *comp_opts = buff;
-	struct squashfs_xz *stream;
-	int dict_size = msblk->block_size;
-	int err, n;
+	struct disk_comp_opts *comp_opts = buff;
+	struct comp_opts *opts;
+	int err = 0, n;
+
+	opts = kmalloc(sizeof(*opts), GFP_KERNEL);
+	if (opts == NULL) {
+		err = -ENOMEM;
+		goto out2;
+	}
 
 	if (comp_opts) {
 		/* check compressor options are the expected length */
 		if (len < sizeof(*comp_opts)) {
 			err = -EIO;
-			goto failed;
+			goto out;
 		}
 
-		dict_size = le32_to_cpu(comp_opts->dictionary_size);
+		opts->dict_size = le32_to_cpu(comp_opts->dictionary_size);
 
 		/* the dictionary size should be 2^n or 2^n+2^(n+1) */
-		n = ffs(dict_size) - 1;
-		if (dict_size != (1 << n) && dict_size != (1 << n) +
+		n = ffs(opts->dict_size) - 1;
+		if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) +
 						(1 << (n + 1))) {
 			err = -EIO;
-			goto failed;
+			goto out;
 		}
-	}
+	} else
+		/* use defaults */
+		opts->dict_size = max_t(int, msblk->block_size,
+							SQUASHFS_METADATA_SIZE);
 
-	dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
+	return opts;
+
+out:
+	kfree(opts);
+out2:
+	return ERR_PTR(err);
+}
+
+
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff)
+{
+	struct comp_opts *comp_opts = buff;
+	struct squashfs_xz *stream;
+	int err;
 
 	stream = kmalloc(sizeof(*stream), GFP_KERNEL);
 	if (stream == NULL) {
@@ -77,7 +102,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
 		goto failed;
 	}
 
-	stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
+	stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size);
 	if (stream->state == NULL) {
 		kfree(stream);
 		err = -ENOMEM;
@@ -103,15 +128,13 @@ static void squashfs_xz_free(void *strm)
 }
 
 
-static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
-	struct buffer_head **bh, int b, int offset, int length, int srclength,
-	int pages)
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
 {
 	enum xz_ret xz_err;
 	int avail, total = 0, k = 0, page = 0;
-	struct squashfs_xz *stream = msblk->stream;
-
-	mutex_lock(&msblk->read_data_mutex);
+	struct squashfs_xz *stream = strm;
 
 	xz_dec_reset(stream->state);
 	stream->buf.in_pos = 0;
@@ -124,10 +147,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		if (stream->buf.in_pos == stream->buf.in_size && k < b) {
 			avail = min(length, msblk->devblksize - offset);
 			length -= avail;
-			wait_on_buffer(bh[k]);
-			if (!buffer_uptodate(bh[k]))
-				goto release_mutex;
-
 			stream->buf.in = bh[k]->b_data + offset;
 			stream->buf.in_size = avail;
 			stream->buf.in_pos = 0;
@@ -147,23 +166,12 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 			put_bh(bh[k++]);
 	} while (xz_err == XZ_OK);
 
-	if (xz_err != XZ_STREAM_END) {
-		ERROR("xz_dec_run error, data probably corrupt\n");
-		goto release_mutex;
-	}
-
-	if (k < b) {
-		ERROR("xz_uncompress error, input remaining\n");
-		goto release_mutex;
-	}
-
-	total += stream->buf.out_pos;
-	mutex_unlock(&msblk->read_data_mutex);
-	return total;
+	if (xz_err != XZ_STREAM_END || k < b)
+		goto out;
 
-release_mutex:
-	mutex_unlock(&msblk->read_data_mutex);
+	return total + stream->buf.out_pos;
 
+out:
 	for (; k < b; k++)
 		put_bh(bh[k]);
 
@@ -172,6 +180,7 @@ release_mutex:
 
 const struct squashfs_decompressor squashfs_xz_comp_ops = {
 	.init = squashfs_xz_init,
+	.comp_opts = squashfs_xz_comp_opts,
 	.free = squashfs_xz_free,
 	.decompress = squashfs_xz_uncompress,
 	.id = XZ_COMPRESSION,
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 55d918fd2d86..bb049027d15c 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -33,7 +33,7 @@
 #include "squashfs.h"
 #include "decompressor.h"
 
-static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
+static void *zlib_init(struct squashfs_sb_info *dummy, void *buff)
 {
 	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
 	if (stream == NULL)
@@ -61,15 +61,13 @@ static void zlib_free(void *strm)
 }
 
 
-static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
-	struct buffer_head **bh, int b, int offset, int length, int srclength,
-	int pages)
+static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
 {
 	int zlib_err, zlib_init = 0;
 	int k = 0, page = 0;
-	z_stream *stream = msblk->stream;
-
-	mutex_lock(&msblk->read_data_mutex);
+	z_stream *stream = strm;
 
 	stream->avail_out = 0;
 	stream->avail_in = 0;
@@ -78,10 +76,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		if (stream->avail_in == 0 && k < b) {
 			int avail = min(length, msblk->devblksize - offset);
 			length -= avail;
-			wait_on_buffer(bh[k]);
-			if (!buffer_uptodate(bh[k]))
-				goto release_mutex;
-
 			stream->next_in = bh[k]->b_data + offset;
 			stream->avail_in = avail;
 			offset = 0;
@@ -94,12 +88,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 
 		if (!zlib_init) {
 			zlib_err = zlib_inflateInit(stream);
-			if (zlib_err != Z_OK) {
-				ERROR("zlib_inflateInit returned unexpected "
-					"result 0x%x, srclength %d\n",
-					zlib_err, srclength);
-				goto release_mutex;
-			}
+			if (zlib_err != Z_OK)
+				goto out;
 			zlib_init = 1;
 		}
 
@@ -109,29 +99,19 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 			put_bh(bh[k++]);
 	} while (zlib_err == Z_OK);
 
-	if (zlib_err != Z_STREAM_END) {
-		ERROR("zlib_inflate error, data probably corrupt\n");
-		goto release_mutex;
-	}
+	if (zlib_err != Z_STREAM_END)
+		goto out;
 
 	zlib_err = zlib_inflateEnd(stream);
-	if (zlib_err != Z_OK) {
-		ERROR("zlib_inflate error, data probably corrupt\n");
-		goto release_mutex;
-	}
-
-	if (k < b) {
-		ERROR("zlib_uncompress error, data remaining\n");
-		goto release_mutex;
-	}
+	if (zlib_err != Z_OK)
+		goto out;
 
-	length = stream->total_out;
-	mutex_unlock(&msblk->read_data_mutex);
-	return length;
+	if (k < b)
+		goto out;
 
-release_mutex:
-	mutex_unlock(&msblk->read_data_mutex);
+	return stream->total_out;
 
+out:
 	for (; k < b; k++)
 		put_bh(bh[k]);
 
-- 
cgit v1.3


From cd59c2ec5f37a2bc1315c9324aab6c21d43ffa1a Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 28 Oct 2013 14:26:30 +0900
Subject: squashfs: Enhance parallel I/O

Now squashfs have used for only one stream buffer for decompression
so it hurts parallel read performance so this patch supports
multiple decompressor to enhance performance parallel I/O.

Four 1G file dd read on KVM machine which has 2 CPU and 4G memory.

dd if=test/test1.dat of=/dev/null &
dd if=test/test2.dat of=/dev/null &
dd if=test/test3.dat of=/dev/null &
dd if=test/test4.dat of=/dev/null &

old : 1m39s -> new : 9s

* From v1
  * Change comp_strm with decomp_strm - Phillip
  * Change/add comments - Phillip

Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
---
 fs/squashfs/Kconfig              |  13 +++
 fs/squashfs/Makefile             |   9 +-
 fs/squashfs/decompressor_multi.c | 200 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+), 1 deletion(-)
 create mode 100644 fs/squashfs/decompressor_multi.c

(limited to 'fs')

diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index c70111ebefd4..1c6d340fc61f 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -63,6 +63,19 @@ config SQUASHFS_LZO
 
 	  If unsure, say N.
 
+config SQUASHFS_MULTI_DECOMPRESSOR
+	bool "Use multiple decompressors for handling parallel I/O"
+	depends on SQUASHFS
+	help
+	  By default Squashfs uses a single decompressor but it gives
+	  poor performance on parallel I/O workloads when using multiple CPU
+	  machines due to waiting on decompressor availability.
+
+	  If you have a parallel I/O workload and your system has enough memory,
+	  using this option may improve overall I/O performance.
+
+	  If unsure, say N.
+
 config SQUASHFS_XZ
 	bool "Include support for XZ compressed file systems"
 	depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index c223c8439c21..dfebc3b12d61 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,8 +4,15 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o decompressor_single.o
+squashfs-y += namei.o super.o symlink.o decompressor.o
+
 squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
 squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o
+
+ifdef CONFIG_SQUASHFS_MULTI_DECOMPRESSOR
+	squashfs-y		+= decompressor_multi.o
+else
+	squashfs-y		+= decompressor_single.o
+endif
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
new file mode 100644
index 000000000000..462731db5130
--- /dev/null
+++ b/fs/squashfs/decompressor_multi.c
@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2013
+ *  Minchan Kim <minchan@kernel.org>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/cpumask.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression in the
+ * decompressor framework
+ */
+
+
+/*
+ * The reason that multiply two is that a CPU can request new I/O
+ * while it is waiting previous request.
+ */
+#define MAX_DECOMPRESSOR	(num_online_cpus() * 2)
+
+
+int squashfs_max_decompressors(void)
+{
+	return MAX_DECOMPRESSOR;
+}
+
+
+struct squashfs_stream {
+	void			*comp_opts;
+	struct list_head	strm_list;
+	struct mutex		mutex;
+	int			avail_decomp;
+	wait_queue_head_t	wait;
+};
+
+
+struct decomp_stream {
+	void *stream;
+	struct list_head list;
+};
+
+
+static void put_decomp_stream(struct decomp_stream *decomp_strm,
+				struct squashfs_stream *stream)
+{
+	mutex_lock(&stream->mutex);
+	list_add(&decomp_strm->list, &stream->strm_list);
+	mutex_unlock(&stream->mutex);
+	wake_up(&stream->wait);
+}
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+				void *comp_opts)
+{
+	struct squashfs_stream *stream;
+	struct decomp_stream *decomp_strm = NULL;
+	int err = -ENOMEM;
+
+	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+	if (!stream)
+		goto out;
+
+	stream->comp_opts = comp_opts;
+	mutex_init(&stream->mutex);
+	INIT_LIST_HEAD(&stream->strm_list);
+	init_waitqueue_head(&stream->wait);
+
+	/*
+	 * We should have a decompressor at least as default
+	 * so if we fail to allocate new decompressor dynamically,
+	 * we could always fall back to default decompressor and
+	 * file system works.
+	 */
+	decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+	if (!decomp_strm)
+		goto out;
+
+	decomp_strm->stream = msblk->decompressor->init(msblk,
+						stream->comp_opts);
+	if (IS_ERR(decomp_strm->stream)) {
+		err = PTR_ERR(decomp_strm->stream);
+		goto out;
+	}
+
+	list_add(&decomp_strm->list, &stream->strm_list);
+	stream->avail_decomp = 1;
+	return stream;
+
+out:
+	kfree(decomp_strm);
+	kfree(stream);
+	return ERR_PTR(err);
+}
+
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+	struct squashfs_stream *stream = msblk->stream;
+	if (stream) {
+		struct decomp_stream *decomp_strm;
+
+		while (!list_empty(&stream->strm_list)) {
+			decomp_strm = list_entry(stream->strm_list.prev,
+						struct decomp_stream, list);
+			list_del(&decomp_strm->list);
+			msblk->decompressor->free(decomp_strm->stream);
+			kfree(decomp_strm);
+			stream->avail_decomp--;
+		}
+	}
+
+	WARN_ON(stream->avail_decomp);
+	kfree(stream->comp_opts);
+	kfree(stream);
+}
+
+
+static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
+					struct squashfs_stream *stream)
+{
+	struct decomp_stream *decomp_strm;
+
+	while (1) {
+		mutex_lock(&stream->mutex);
+
+		/* There is available decomp_stream */
+		if (!list_empty(&stream->strm_list)) {
+			decomp_strm = list_entry(stream->strm_list.prev,
+				struct decomp_stream, list);
+			list_del(&decomp_strm->list);
+			mutex_unlock(&stream->mutex);
+			break;
+		}
+
+		/*
+		 * If there is no available decomp and already full,
+		 * let's wait for releasing decomp from other users.
+		 */
+		if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+			goto wait;
+
+		/* Let's allocate new decomp */
+		decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+		if (!decomp_strm)
+			goto wait;
+
+		decomp_strm->stream = msblk->decompressor->init(msblk,
+						stream->comp_opts);
+		if (IS_ERR(decomp_strm->stream)) {
+			kfree(decomp_strm);
+			goto wait;
+		}
+
+		stream->avail_decomp++;
+		WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+
+		mutex_unlock(&stream->mutex);
+		break;
+wait:
+		/*
+		 * If system memory is tough, let's for other's
+		 * releasing instead of hurting VM because it could
+		 * make page cache thrashing.
+		 */
+		mutex_unlock(&stream->mutex);
+		wait_event(stream->wait,
+			!list_empty(&stream->strm_list));
+	}
+
+	return decomp_strm;
+}
+
+
+int squashfs_decompress(struct squashfs_sb_info *msblk,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
+{
+	int res;
+	struct squashfs_stream *stream = msblk->stream;
+	struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
+	res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
+		buffer, bh, b, offset, length, srclength, pages);
+	put_decomp_stream(decomp_stream, stream);
+	if (res < 0)
+		ERROR("%s decompression failed, data probably corrupt\n",
+			msblk->decompressor->name);
+	return res;
+}
-- 
cgit v1.3


From d208383d640727b70cd6689bc17e67e9b5ebf4ff Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Mon, 18 Nov 2013 02:31:36 +0000
Subject: Squashfs: add multi-threaded decompression using percpu variable

Add a multi-threaded decompression implementation which uses
percpu variables.

Using percpu variables has advantages and disadvantages over
implementations which do not use percpu variables.

Advantages:
  * the nature of percpu variables ensures decompression is
    load-balanced across the multiple cores.
  * simplicity.

Disadvantages: it limits decompression to one thread per core.

Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
---
 fs/squashfs/Kconfig                     | 57 ++++++++++++++-----
 fs/squashfs/Makefile                    | 10 +---
 fs/squashfs/decompressor_multi_percpu.c | 98 +++++++++++++++++++++++++++++++++
 3 files changed, 145 insertions(+), 20 deletions(-)
 create mode 100644 fs/squashfs/decompressor_multi_percpu.c

(limited to 'fs')

diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 1c6d340fc61f..159bd6676dc2 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -25,6 +25,50 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+choice
+	prompt "Decompressor parallelisation options"
+	depends on SQUASHFS
+	help
+	  Squashfs now supports three parallelisation options for
+	  decompression.  Each one exhibits various trade-offs between
+	  decompression performance and CPU and memory usage.
+
+	  If in doubt, select "Single threaded compression"
+
+config SQUASHFS_DECOMP_SINGLE
+	bool "Single threaded compression"
+	help
+	  Traditionally Squashfs has used single-threaded decompression.
+	  Only one block (data or metadata) can be decompressed at any
+	  one time.  This limits CPU and memory usage to a minimum.
+
+config SQUASHFS_DECOMP_MULTI
+	bool "Use multiple decompressors for parallel I/O"
+	help
+	  By default Squashfs uses a single decompressor but it gives
+	  poor performance on parallel I/O workloads when using multiple CPU
+	  machines due to waiting on decompressor availability.
+
+	  If you have a parallel I/O workload and your system has enough memory,
+	  using this option may improve overall I/O performance.
+
+	  This decompressor implementation uses up to two parallel
+	  decompressors per core.  It dynamically allocates decompressors
+	  on a demand basis.
+
+config SQUASHFS_DECOMP_MULTI_PERCPU
+	bool "Use percpu multiple decompressors for parallel I/O"
+	help
+	  By default Squashfs uses a single decompressor but it gives
+	  poor performance on parallel I/O workloads when using multiple CPU
+	  machines due to waiting on decompressor availability.
+
+	  This decompressor implementation uses a maximum of one
+	  decompressor per core.  It uses percpu variables to ensure
+	  decompression is load-balanced across the cores.
+
+endchoice
+
 config SQUASHFS_XATTR
 	bool "Squashfs XATTR support"
 	depends on SQUASHFS
@@ -63,19 +107,6 @@ config SQUASHFS_LZO
 
 	  If unsure, say N.
 
-config SQUASHFS_MULTI_DECOMPRESSOR
-	bool "Use multiple decompressors for handling parallel I/O"
-	depends on SQUASHFS
-	help
-	  By default Squashfs uses a single decompressor but it gives
-	  poor performance on parallel I/O workloads when using multiple CPU
-	  machines due to waiting on decompressor availability.
-
-	  If you have a parallel I/O workload and your system has enough memory,
-	  using this option may improve overall I/O performance.
-
-	  If unsure, say N.
-
 config SQUASHFS_XZ
 	bool "Include support for XZ compressed file systems"
 	depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index dfebc3b12d61..5833b96ee69c 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,14 +5,10 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o decompressor.o
-
+squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
 squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
 squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o
-
-ifdef CONFIG_SQUASHFS_MULTI_DECOMPRESSOR
-	squashfs-y		+= decompressor_multi.o
-else
-	squashfs-y		+= decompressor_single.o
-endif
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
new file mode 100644
index 000000000000..0e7b679bc4ad
--- /dev/null
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression using percpu
+ * variables, one thread per cpu core.
+ */
+
+struct squashfs_stream {
+	void		*stream;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+						void *comp_opts)
+{
+	struct squashfs_stream *stream;
+	struct squashfs_stream __percpu *percpu;
+	int err, cpu;
+
+	percpu = alloc_percpu(struct squashfs_stream);
+	if (percpu == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	for_each_possible_cpu(cpu) {
+		stream = per_cpu_ptr(percpu, cpu);
+		stream->stream = msblk->decompressor->init(msblk, comp_opts);
+		if (IS_ERR(stream->stream)) {
+			err = PTR_ERR(stream->stream);
+			goto out;
+		}
+	}
+
+	kfree(comp_opts);
+	return (__force void *) percpu;
+
+out:
+	for_each_possible_cpu(cpu) {
+		stream = per_cpu_ptr(percpu, cpu);
+		if (!IS_ERR_OR_NULL(stream->stream))
+			msblk->decompressor->free(stream->stream);
+	}
+	free_percpu(percpu);
+	return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+	struct squashfs_stream __percpu *percpu =
+			(struct squashfs_stream __percpu *) msblk->stream;
+	struct squashfs_stream *stream;
+	int cpu;
+
+	if (msblk->stream) {
+		for_each_possible_cpu(cpu) {
+			stream = per_cpu_ptr(percpu, cpu);
+			msblk->decompressor->free(stream->stream);
+		}
+		free_percpu(percpu);
+	}
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
+{
+	struct squashfs_stream __percpu *percpu =
+			(struct squashfs_stream __percpu *) msblk->stream;
+	struct squashfs_stream *stream = get_cpu_ptr(percpu);
+	int res = msblk->decompressor->decompress(msblk, stream->stream, buffer,
+		bh, b, offset, length, srclength, pages);
+	put_cpu_ptr(stream);
+
+	if (res < 0)
+		ERROR("%s decompression failed, data probably corrupt\n",
+			msblk->decompressor->name);
+
+	return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+	return num_possible_cpus();
+}
-- 
cgit v1.3


From 846b730e99518a1c9945afcb2afbe4d08a02ed80 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Mon, 18 Nov 2013 02:59:12 +0000
Subject: Squashfs: Generalise paging handling in the decompressors

Further generalise the decompressors by adding a page handler
abstraction.  This adds helpers to allow the decompressors
to access and process the output buffers in an implementation
independant manner.

This allows different types of output buffer to be passed
to the decompressors, with the implementation specific
aspects handled at decompression time, but without the
knowledge being held in the decompressor wrapper code.

This will allow the decompressors to handle Squashfs
cache buffers, and page cache pages.

This patch adds the abstraction and an implementation for
the caches.

Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reviewed-by: Minchan Kim <minchan@kernel.org>
---
 fs/squashfs/block.c                     | 27 ++++++++++--------
 fs/squashfs/cache.c                     | 28 +++++++++++++++----
 fs/squashfs/decompressor.c              | 14 ++++++++--
 fs/squashfs/decompressor.h              |  5 ++--
 fs/squashfs/decompressor_multi.c        |  7 ++---
 fs/squashfs/decompressor_multi_percpu.c |  9 +++---
 fs/squashfs/decompressor_single.c       |  9 +++---
 fs/squashfs/lzo_wrapper.c               | 27 ++++++++++++------
 fs/squashfs/page_actor.h                | 49 +++++++++++++++++++++++++++++++++
 fs/squashfs/squashfs.h                  |  8 +++---
 fs/squashfs/squashfs_fs_sb.h            |  1 +
 fs/squashfs/xz_wrapper.c                | 22 +++++++++------
 fs/squashfs/zlib_wrapper.c              | 24 ++++++++++------
 13 files changed, 163 insertions(+), 67 deletions(-)
 create mode 100644 fs/squashfs/page_actor.h

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 4dd402597f22..0cea9b9236d0 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -36,6 +36,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
 /*
  * Read the metadata block length, this is stored in the first two
@@ -86,16 +87,16 @@ static struct buffer_head *get_block_length(struct super_block *sb,
  * generated a larger block - this does occasionally happen with compression
  * algorithms).
  */
-int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
-			int length, u64 *next_index, int srclength, int pages)
+int squashfs_read_data(struct super_block *sb, u64 index, int length,
+		u64 *next_index, struct squashfs_page_actor *output)
 {
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	struct buffer_head **bh;
 	int offset = index & ((1 << msblk->devblksize_log2) - 1);
 	u64 cur_index = index >> msblk->devblksize_log2;
-	int bytes, compressed, b = 0, k = 0, page = 0, avail, i;
+	int bytes, compressed, b = 0, k = 0, avail, i;
 
-	bh = kcalloc(((srclength + msblk->devblksize - 1)
+	bh = kcalloc(((output->length + msblk->devblksize - 1)
 		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
 	if (bh == NULL)
 		return -ENOMEM;
@@ -111,9 +112,9 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 			*next_index = index + length;
 
 		TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
-			index, compressed ? "" : "un", length, srclength);
+			index, compressed ? "" : "un", length, output->length);
 
-		if (length < 0 || length > srclength ||
+		if (length < 0 || length > output->length ||
 				(index + length) > msblk->bytes_used)
 			goto read_failure;
 
@@ -145,7 +146,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 		TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
 				compressed ? "" : "un", length);
 
-		if (length < 0 || length > srclength ||
+		if (length < 0 || length > output->length ||
 					(index + length) > msblk->bytes_used)
 			goto block_release;
 
@@ -165,8 +166,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	}
 
 	if (compressed) {
-		length = squashfs_decompress(msblk, buffer, bh, b, offset,
-			 length, srclength, pages);
+		length = squashfs_decompress(msblk, bh, b, offset, length,
+			output);
 		if (length < 0)
 			goto read_failure;
 	} else {
@@ -174,19 +175,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 		 * Block is uncompressed.
 		 */
 		int in, pg_offset = 0;
+		void *data = squashfs_first_page(output);
 
 		for (bytes = length; k < b; k++) {
 			in = min(bytes, msblk->devblksize - offset);
 			bytes -= in;
 			while (in) {
 				if (pg_offset == PAGE_CACHE_SIZE) {
-					page++;
+					data = squashfs_next_page(output);
 					pg_offset = 0;
 				}
 				avail = min_t(int, in, PAGE_CACHE_SIZE -
 						pg_offset);
-				memcpy(buffer[page] + pg_offset,
-						bh[k]->b_data + offset, avail);
+				memcpy(data + pg_offset, bh[k]->b_data + offset,
+						avail);
 				in -= avail;
 				pg_offset += avail;
 				offset += avail;
@@ -194,6 +196,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 			offset = 0;
 			put_bh(bh[k]);
 		}
+		squashfs_finish_page(output);
 	}
 
 	kfree(bh);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index af0b73802592..1cb70a0b2168 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -56,6 +56,7 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
+#include "page_actor.h"
 
 /*
  * Look-up block in cache, and increment usage count.  If not in cache, read
@@ -119,9 +120,8 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
 			entry->error = 0;
 			spin_unlock(&cache->lock);
 
-			entry->length = squashfs_read_data(sb, entry->data,
-				block, length, &entry->next_index,
-				cache->block_size, cache->pages);
+			entry->length = squashfs_read_data(sb, block, length,
+				&entry->next_index, entry->actor);
 
 			spin_lock(&cache->lock);
 
@@ -220,6 +220,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
 				kfree(cache->entry[i].data[j]);
 			kfree(cache->entry[i].data);
 		}
+		kfree(cache->entry[i].actor);
 	}
 
 	kfree(cache->entry);
@@ -280,6 +281,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
 				goto cleanup;
 			}
 		}
+
+		entry->actor = squashfs_page_actor_init(entry->data,
+						cache->pages, 0);
+		if (entry->actor == NULL) {
+			ERROR("Failed to allocate %s cache entry\n", name);
+			goto cleanup;
+		}
 	}
 
 	return cache;
@@ -410,6 +418,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
 	int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	int i, res;
 	void *table, *buffer, **data;
+	struct squashfs_page_actor *actor;
 
 	table = buffer = kmalloc(length, GFP_KERNEL);
 	if (table == NULL)
@@ -421,19 +430,28 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
 		goto failed;
 	}
 
+	actor = squashfs_page_actor_init(data, pages, length);
+	if (actor == NULL) {
+		res = -ENOMEM;
+		goto failed2;
+	}
+
 	for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
 		data[i] = buffer;
 
-	res = squashfs_read_data(sb, data, block, length |
-		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
+	res = squashfs_read_data(sb, block, length |
+		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
 
 	kfree(data);
+	kfree(actor);
 
 	if (res < 0)
 		goto failed;
 
 	return table;
 
+failed2:
+	kfree(data);
 failed:
 	kfree(table);
 	return ERR_PTR(res);
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 234291f79ba5..ac22fe73b0ad 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -30,6 +30,7 @@
 #include "squashfs_fs_sb.h"
 #include "decompressor.h"
 #include "squashfs.h"
+#include "page_actor.h"
 
 /*
  * This file (and decompressor.h) implements a decompressor framework for
@@ -87,6 +88,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags)
 {
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	void *buffer = NULL, *comp_opts;
+	struct squashfs_page_actor *actor = NULL;
 	int length = 0;
 
 	/*
@@ -99,9 +101,14 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags)
 			goto out;
 		}
 
-		length = squashfs_read_data(sb, &buffer,
-			sizeof(struct squashfs_super_block), 0, NULL,
-				PAGE_CACHE_SIZE, 1);
+		actor = squashfs_page_actor_init(&buffer, 1, 0);
+		if (actor == NULL) {
+			comp_opts = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+
+		length = squashfs_read_data(sb,
+			sizeof(struct squashfs_super_block), 0, NULL, actor);
 
 		if (length < 0) {
 			comp_opts = ERR_PTR(length);
@@ -112,6 +119,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags)
 	comp_opts = squashfs_comp_opts(msblk, buffer, length);
 
 out:
+	kfree(actor);
 	kfree(buffer);
 	return comp_opts;
 }
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 6cdb20a3878a..af0985321808 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -27,8 +27,9 @@ struct squashfs_decompressor {
 	void	*(*init)(struct squashfs_sb_info *, void *);
 	void	*(*comp_opts)(struct squashfs_sb_info *, void *, int);
 	void	(*free)(void *);
-	int	(*decompress)(struct squashfs_sb_info *, void *, void **,
-		struct buffer_head **, int, int, int, int, int);
+	int	(*decompress)(struct squashfs_sb_info *, void *,
+		struct buffer_head **, int, int, int,
+		struct squashfs_page_actor *);
 	int	id;
 	char	*name;
 	int	supported;
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
index 462731db5130..ae54675a3526 100644
--- a/fs/squashfs/decompressor_multi.c
+++ b/fs/squashfs/decompressor_multi.c
@@ -183,15 +183,14 @@ wait:
 }
 
 
-int squashfs_decompress(struct squashfs_sb_info *msblk,
-	void **buffer, struct buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+	int b, int offset, int length, struct squashfs_page_actor *output)
 {
 	int res;
 	struct squashfs_stream *stream = msblk->stream;
 	struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
 	res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
-		buffer, bh, b, offset, length, srclength, pages);
+		bh, b, offset, length, output);
 	put_decomp_stream(decomp_stream, stream);
 	if (res < 0)
 		ERROR("%s decompression failed, data probably corrupt\n",
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index 0e7b679bc4ad..23a9c28ad8ea 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -74,15 +74,14 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
 	}
 }
 
-int squashfs_decompress(struct squashfs_sb_info *msblk,
-	void **buffer, struct buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+	int b, int offset, int length, struct squashfs_page_actor *output)
 {
 	struct squashfs_stream __percpu *percpu =
 			(struct squashfs_stream __percpu *) msblk->stream;
 	struct squashfs_stream *stream = get_cpu_ptr(percpu);
-	int res = msblk->decompressor->decompress(msblk, stream->stream, buffer,
-		bh, b, offset, length, srclength, pages);
+	int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+		offset, length, output);
 	put_cpu_ptr(stream);
 
 	if (res < 0)
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
index f857cf6f22d4..a6c75929a00e 100644
--- a/fs/squashfs/decompressor_single.c
+++ b/fs/squashfs/decompressor_single.c
@@ -61,16 +61,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
 	}
 }
 
-int squashfs_decompress(struct squashfs_sb_info *msblk,
-	void **buffer, struct buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+	int b, int offset, int length, struct squashfs_page_actor *output)
 {
 	int res;
 	struct squashfs_stream *stream = msblk->stream;
 
 	mutex_lock(&stream->mutex);
-	res = msblk->decompressor->decompress(msblk, stream->stream, buffer,
-		bh, b, offset, length, srclength, pages);
+	res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+		offset, length, output);
 	mutex_unlock(&stream->mutex);
 
 	if (res < 0)
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 75c3b5779172..244b9fbfff7b 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -31,6 +31,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
 struct squashfs_lzo {
 	void	*input;
@@ -75,13 +76,13 @@ static void lzo_free(void *strm)
 
 
 static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
-	void **buffer, struct buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
+	struct buffer_head **bh, int b, int offset, int length,
+	struct squashfs_page_actor *output)
 {
 	struct squashfs_lzo *stream = strm;
-	void *buff = stream->input;
+	void *buff = stream->input, *data;
 	int avail, i, bytes = length, res;
-	size_t out_len = srclength;
+	size_t out_len = output->length;
 
 	for (i = 0; i < b; i++) {
 		avail = min(bytes, msblk->devblksize - offset);
@@ -98,12 +99,20 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		goto failed;
 
 	res = bytes = (int)out_len;
-	for (i = 0, buff = stream->output; bytes && i < pages; i++) {
-		avail = min_t(int, bytes, PAGE_CACHE_SIZE);
-		memcpy(buffer[i], buff, avail);
-		buff += avail;
-		bytes -= avail;
+	data = squashfs_first_page(output);
+	buff = stream->output;
+	while (data) {
+		if (bytes <= PAGE_CACHE_SIZE) {
+			memcpy(data, buff, bytes);
+			break;
+		} else {
+			memcpy(data, buff, PAGE_CACHE_SIZE);
+			buff += PAGE_CACHE_SIZE;
+			bytes -= PAGE_CACHE_SIZE;
+			data = squashfs_next_page(output);
+		}
 	}
+	squashfs_finish_page(output);
 
 	return res;
 
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
new file mode 100644
index 000000000000..5b0ba5a7133a
--- /dev/null
+++ b/fs/squashfs/page_actor.h
@@ -0,0 +1,49 @@
+#ifndef PAGE_ACTOR_H
+#define PAGE_ACTOR_H
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+struct squashfs_page_actor {
+	void	**page;
+	int	pages;
+	int	length;
+	int	next_page;
+};
+
+static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
+	int pages, int length)
+{
+	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+	if (actor == NULL)
+		return NULL;
+
+	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->page = page;
+	actor->pages = pages;
+	actor->next_page = 0;
+	return actor;
+}
+
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+	actor->next_page = 1;
+	return actor->page[0];
+}
+
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+	return actor->next_page == actor->pages ? NULL :
+		actor->page[actor->next_page++];
+}
+
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+	/* empty */
+}
+#endif
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 2e2751df8452..6a97e63ca173 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -28,8 +28,8 @@
 #define WARNING(s, args...)	pr_warning("SQUASHFS: "s, ## args)
 
 /* block.c */
-extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
-				int, int);
+extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
+				struct squashfs_page_actor *);
 
 /* cache.c */
 extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
@@ -53,8 +53,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
 /* decompressor_xxx.c */
 extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
 extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
-extern int squashfs_decompress(struct squashfs_sb_info *, void **,
-	struct buffer_head **, int, int, int, int, int);
+extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **,
+	int, int, int, struct squashfs_page_actor *);
 extern int squashfs_max_decompressors(void);
 
 /* export.c */
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 9cdcf4150d59..1da565cb50c3 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -50,6 +50,7 @@ struct squashfs_cache_entry {
 	wait_queue_head_t	wait_queue;
 	struct squashfs_cache	*cache;
 	void			**data;
+	struct squashfs_page_actor	*actor;
 };
 
 struct squashfs_sb_info {
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 5d1d07cca6b4..c609624e4b8a 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -32,6 +32,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
 struct squashfs_xz {
 	struct xz_dec *state;
@@ -129,11 +130,11 @@ static void squashfs_xz_free(void *strm)
 
 
 static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
-	void **buffer, struct buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
+	struct buffer_head **bh, int b, int offset, int length,
+	struct squashfs_page_actor *output)
 {
 	enum xz_ret xz_err;
-	int avail, total = 0, k = 0, page = 0;
+	int avail, total = 0, k = 0;
 	struct squashfs_xz *stream = strm;
 
 	xz_dec_reset(stream->state);
@@ -141,7 +142,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	stream->buf.in_size = 0;
 	stream->buf.out_pos = 0;
 	stream->buf.out_size = PAGE_CACHE_SIZE;
-	stream->buf.out = buffer[page++];
+	stream->buf.out = squashfs_first_page(output);
 
 	do {
 		if (stream->buf.in_pos == stream->buf.in_size && k < b) {
@@ -153,11 +154,12 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 			offset = 0;
 		}
 
-		if (stream->buf.out_pos == stream->buf.out_size
-							&& page < pages) {
-			stream->buf.out = buffer[page++];
-			stream->buf.out_pos = 0;
-			total += PAGE_CACHE_SIZE;
+		if (stream->buf.out_pos == stream->buf.out_size) {
+			stream->buf.out = squashfs_next_page(output);
+			if (stream->buf.out != NULL) {
+				stream->buf.out_pos = 0;
+				total += PAGE_CACHE_SIZE;
+			}
 		}
 
 		xz_err = xz_dec_run(stream->state, &stream->buf);
@@ -166,6 +168,8 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 			put_bh(bh[k++]);
 	} while (xz_err == XZ_OK);
 
+	squashfs_finish_page(output);
+
 	if (xz_err != XZ_STREAM_END || k < b)
 		goto out;
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index bb049027d15c..8727caba6882 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -32,6 +32,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
 static void *zlib_init(struct squashfs_sb_info *dummy, void *buff)
 {
@@ -62,14 +63,14 @@ static void zlib_free(void *strm)
 
 
 static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
-	void **buffer, struct buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
+	struct buffer_head **bh, int b, int offset, int length,
+	struct squashfs_page_actor *output)
 {
-	int zlib_err, zlib_init = 0;
-	int k = 0, page = 0;
+	int zlib_err, zlib_init = 0, k = 0;
 	z_stream *stream = strm;
 
-	stream->avail_out = 0;
+	stream->avail_out = PAGE_CACHE_SIZE;
+	stream->next_out = squashfs_first_page(output);
 	stream->avail_in = 0;
 
 	do {
@@ -81,15 +82,18 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 			offset = 0;
 		}
 
-		if (stream->avail_out == 0 && page < pages) {
-			stream->next_out = buffer[page++];
-			stream->avail_out = PAGE_CACHE_SIZE;
+		if (stream->avail_out == 0) {
+			stream->next_out = squashfs_next_page(output);
+			if (stream->next_out != NULL)
+				stream->avail_out = PAGE_CACHE_SIZE;
 		}
 
 		if (!zlib_init) {
 			zlib_err = zlib_inflateInit(stream);
-			if (zlib_err != Z_OK)
+			if (zlib_err != Z_OK) {
+				squashfs_finish_page(output);
 				goto out;
+			}
 			zlib_init = 1;
 		}
 
@@ -99,6 +103,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 			put_bh(bh[k++]);
 	} while (zlib_err == Z_OK);
 
+	squashfs_finish_page(output);
+
 	if (zlib_err != Z_STREAM_END)
 		goto out;
 
-- 
cgit v1.3


From 5f55dbc0c5c466a9cdfa4da7ac1bfe351c7fc52a Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Thu, 31 Oct 2013 19:24:27 +0000
Subject: Squashfs: Restructure squashfs_readpage()

Restructure squashfs_readpage() splitting it into separate
functions for datablocks, fragments and sparse blocks.

Move the memcpying (from squashfs cache entry) implementation of
squashfs_readpage_block into file_cache.c

This allows different implementations to be supported.

Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reviewed-by: Minchan Kim <minchan@kernel.org>
---
 fs/squashfs/Makefile     |   2 +-
 fs/squashfs/file.c       | 142 ++++++++++++++++++++++++-----------------------
 fs/squashfs/file_cache.c |  38 +++++++++++++
 fs/squashfs/squashfs.h   |   7 +++
 4 files changed, 118 insertions(+), 71 deletions(-)
 create mode 100644 fs/squashfs/file_cache.c

(limited to 'fs')

diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 5833b96ee69c..e01ba1126c89 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-y += namei.o super.o symlink.o decompressor.o file_cache.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 8ca62c28fe12..e5c9689062ba 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -370,77 +370,15 @@ static int read_blocklist(struct inode *inode, int index, u64 *block)
 	return le32_to_cpu(size);
 }
 
-
-static int squashfs_readpage(struct file *file, struct page *page)
+/* Copy data into page cache  */
+void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
+	int bytes, int offset)
 {
 	struct inode *inode = page->mapping->host;
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-	int bytes, i, offset = 0, sparse = 0;
-	struct squashfs_cache_entry *buffer = NULL;
 	void *pageaddr;
-
-	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
-	int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
-	int start_index = page->index & ~mask;
-	int end_index = start_index | mask;
-	int file_end = i_size_read(inode) >> msblk->block_log;
-
-	TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
-				page->index, squashfs_i(inode)->start);
-
-	if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT))
-		goto out;
-
-	if (index < file_end || squashfs_i(inode)->fragment_block ==
-					SQUASHFS_INVALID_BLK) {
-		/*
-		 * Reading a datablock from disk.  Need to read block list
-		 * to get location and block size.
-		 */
-		u64 block = 0;
-		int bsize = read_blocklist(inode, index, &block);
-		if (bsize < 0)
-			goto error_out;
-
-		if (bsize == 0) { /* hole */
-			bytes = index == file_end ?
-				(i_size_read(inode) & (msblk->block_size - 1)) :
-				 msblk->block_size;
-			sparse = 1;
-		} else {
-			/*
-			 * Read and decompress datablock.
-			 */
-			buffer = squashfs_get_datablock(inode->i_sb,
-								block, bsize);
-			if (buffer->error) {
-				ERROR("Unable to read page, block %llx, size %x"
-					"\n", block, bsize);
-				squashfs_cache_put(buffer);
-				goto error_out;
-			}
-			bytes = buffer->length;
-		}
-	} else {
-		/*
-		 * Datablock is stored inside a fragment (tail-end packed
-		 * block).
-		 */
-		buffer = squashfs_get_fragment(inode->i_sb,
-				squashfs_i(inode)->fragment_block,
-				squashfs_i(inode)->fragment_size);
-
-		if (buffer->error) {
-			ERROR("Unable to read page, block %llx, size %x\n",
-				squashfs_i(inode)->fragment_block,
-				squashfs_i(inode)->fragment_size);
-			squashfs_cache_put(buffer);
-			goto error_out;
-		}
-		bytes = i_size_read(inode) & (msblk->block_size - 1);
-		offset = squashfs_i(inode)->fragment_offset;
-	}
+	int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	int start_index = page->index & ~mask, end_index = start_index | mask;
 
 	/*
 	 * Loop copying datablock into pages.  As the datablock likely covers
@@ -451,7 +389,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
 	for (i = start_index; i <= end_index && bytes > 0; i++,
 			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
 		struct page *push_page;
-		int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
+		int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
 
 		TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
 
@@ -475,11 +413,75 @@ skip_page:
 		if (i != page->index)
 			page_cache_release(push_page);
 	}
+}
+
+/* Read datablock stored packed inside a fragment (tail-end packed block) */
+static int squashfs_readpage_fragment(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+		squashfs_i(inode)->fragment_block,
+		squashfs_i(inode)->fragment_size);
+	int res = buffer->error;
+
+	if (res)
+		ERROR("Unable to read page, block %llx, size %x\n",
+			squashfs_i(inode)->fragment_block,
+			squashfs_i(inode)->fragment_size);
+	else
+		squashfs_copy_cache(page, buffer, i_size_read(inode) &
+			(msblk->block_size - 1),
+			squashfs_i(inode)->fragment_offset);
+
+	squashfs_cache_put(buffer);
+	return res;
+}
 
-	if (!sparse)
-		squashfs_cache_put(buffer);
+static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int bytes = index == file_end ?
+			(i_size_read(inode) & (msblk->block_size - 1)) :
+			 msblk->block_size;
 
+	squashfs_copy_cache(page, NULL, bytes, 0);
 	return 0;
+}
+
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+	int file_end = i_size_read(inode) >> msblk->block_log;
+	int res;
+	void *pageaddr;
+
+	TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+				page->index, squashfs_i(inode)->start);
+
+	if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT))
+		goto out;
+
+	if (index < file_end || squashfs_i(inode)->fragment_block ==
+					SQUASHFS_INVALID_BLK) {
+		u64 block = 0;
+		int bsize = read_blocklist(inode, index, &block);
+		if (bsize < 0)
+			goto error_out;
+
+		if (bsize == 0)
+			res = squashfs_readpage_sparse(page, index, file_end);
+		else
+			res = squashfs_readpage_block(page, block, bsize);
+	} else
+		res = squashfs_readpage_fragment(page);
+
+	if (!res)
+		return 0;
 
 error_out:
 	SetPageError(page);
diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c
new file mode 100644
index 000000000000..f2310d2a2019
--- /dev/null
+++ b/fs/squashfs/file_cache.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/* Read separately compressed datablock and memcopy into page cache */
+int squashfs_readpage_block(struct page *page, u64 block, int bsize)
+{
+	struct inode *i = page->mapping->host;
+	struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+		block, bsize);
+	int res = buffer->error;
+
+	if (res)
+		ERROR("Unable to read page, block %llx, size %x\n", block,
+			bsize);
+	else
+		squashfs_copy_cache(page, buffer, buffer->length, 0);
+
+	squashfs_cache_put(buffer);
+	return res;
+}
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6a97e63ca173..9e1bb79f7e6f 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -66,6 +66,13 @@ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
 extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
 				u64, u64, unsigned int);
 
+/* file.c */
+void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
+				int);
+
+/* file_xxx.c */
+extern int squashfs_readpage_block(struct page *, u64, int);
+
 /* id.c */
 extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
 extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64,
-- 
cgit v1.3


From 0d455c12c6428647547bacccaaced3cae0f35570 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Wed, 13 Nov 2013 02:04:19 +0000
Subject: Squashfs: Directly decompress into the page cache for file data

This introduces an implementation of squashfs_readpage_block()
that directly decompresses into the page cache.

This uses the previously added page handler abstraction to push
down the necessary kmap_atomic/kunmap_atomic operations on the
page cache buffers into the decompressors.  This enables
direct copying into the page cache without using the slow
kmap/kunmap calls.

The code detects when multiple threads are racing in
squashfs_readpage() to decompress the same block, and avoids
this regression by falling back to using an intermediate
buffer.

This patch enhances the performance of Squashfs significantly
when multiple processes are accessing the filesystem simultaneously
because it not only reduces memcopying, but it more importantly
eliminates the lock contention on the intermediate buffer.

Using single-thread decompression.

        dd if=file1 of=/dev/null bs=4096 &
        dd if=file2 of=/dev/null bs=4096 &
        dd if=file3 of=/dev/null bs=4096 &
        dd if=file4 of=/dev/null bs=4096

Before:

629145600 bytes (629 MB) copied, 45.8046 s, 13.7 MB/s

After:

629145600 bytes (629 MB) copied, 9.29414 s, 67.7 MB/s

Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reviewed-by: Minchan Kim <minchan@kernel.org>
---
 fs/squashfs/Kconfig       |  28 ++++++++
 fs/squashfs/Makefile      |   4 +-
 fs/squashfs/file_direct.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/squashfs/page_actor.c  | 100 +++++++++++++++++++++++++++
 fs/squashfs/page_actor.h  |  32 +++++++++
 5 files changed, 336 insertions(+), 1 deletion(-)
 create mode 100644 fs/squashfs/file_direct.c
 create mode 100644 fs/squashfs/page_actor.c

(limited to 'fs')

diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 159bd6676dc2..b6fa8657dcbc 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -25,6 +25,34 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+choice
+	prompt "File decompression options"
+	depends on SQUASHFS
+	help
+	  Squashfs now supports two options for decompressing file
+	  data.  Traditionally Squashfs has decompressed into an
+	  intermediate buffer and then memcopied it into the page cache.
+	  Squashfs now supports the ability to decompress directly into
+	  the page cache.
+
+	  If unsure, select "Decompress file data into an intermediate buffer"
+
+config SQUASHFS_FILE_CACHE
+	bool "Decompress file data into an intermediate buffer"
+	help
+	  Decompress file data into an intermediate buffer and then
+	  memcopy it into the page cache.
+
+config SQUASHFS_FILE_DIRECT
+	bool "Decompress files directly into the page cache"
+	help
+	  Directly decompress file data into the page cache.
+	  Doing so can significantly improve performance because
+	  it eliminates a memcpy and it also removes the lock contention
+	  on the single buffer.
+
+endchoice
+
 choice
 	prompt "Decompressor parallelisation options"
 	depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index e01ba1126c89..4132520b4ff2 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,7 +4,9 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o file_cache.o
+squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
new file mode 100644
index 000000000000..2943b2bfae48
--- /dev/null
+++ b/fs/squashfs/file_direct.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "page_actor.h"
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+	int pages, struct page **page);
+
+/* Read separately compressed datablock directly into page cache */
+int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
+
+{
+	struct inode *inode = target_page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+
+	int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	int start_index = target_page->index & ~mask;
+	int end_index = start_index | mask;
+	int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+	struct page **page;
+	struct squashfs_page_actor *actor;
+	void *pageaddr;
+
+	if (end_index > file_end)
+		end_index = file_end;
+
+	pages = end_index - start_index + 1;
+
+	page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);
+	if (page == NULL)
+		return res;
+
+	/*
+	 * Create a "page actor" which will kmap and kunmap the
+	 * page cache pages appropriately within the decompressor
+	 */
+	actor = squashfs_page_actor_init_special(page, pages, 0);
+	if (actor == NULL)
+		goto out;
+
+	/* Try to grab all the pages covered by the Squashfs block */
+	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+		page[i] = (n == target_page->index) ? target_page :
+			grab_cache_page_nowait(target_page->mapping, n);
+
+		if (page[i] == NULL) {
+			missing_pages++;
+			continue;
+		}
+
+		if (PageUptodate(page[i])) {
+			unlock_page(page[i]);
+			page_cache_release(page[i]);
+			page[i] = NULL;
+			missing_pages++;
+		}
+	}
+
+	if (missing_pages) {
+		/*
+		 * Couldn't get one or more pages, this page has either
+		 * been VM reclaimed, but others are still in the page cache
+		 * and uptodate, or we're racing with another thread in
+		 * squashfs_readpage also trying to grab them.  Fall back to
+		 * using an intermediate buffer.
+		 */
+		res = squashfs_read_cache(target_page, block, bsize, pages,
+								page);
+		goto out;
+	}
+
+	/* Decompress directly into the page cache buffers */
+	res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+	if (res < 0)
+		goto mark_errored;
+
+	/* Last page may have trailing bytes not filled */
+	bytes = res % PAGE_CACHE_SIZE;
+	if (bytes) {
+		pageaddr = kmap_atomic(page[pages - 1]);
+		memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+		kunmap_atomic(pageaddr);
+	}
+
+	/* Mark pages as uptodate, unlock and release */
+	for (i = 0; i < pages; i++) {
+		flush_dcache_page(page[i]);
+		SetPageUptodate(page[i]);
+		unlock_page(page[i]);
+		if (page[i] != target_page)
+			page_cache_release(page[i]);
+	}
+
+	kfree(actor);
+	kfree(page);
+
+	return 0;
+
+mark_errored:
+	/* Decompression failed, mark pages as errored.  Target_page is
+	 * dealt with by the caller
+	 */
+	for (i = 0; i < pages; i++) {
+		if (page[i] == target_page)
+			continue;
+		flush_dcache_page(page[i]);
+		SetPageError(page[i]);
+		unlock_page(page[i]);
+		page_cache_release(page[i]);
+	}
+
+out:
+	kfree(actor);
+	kfree(page);
+	return res;
+}
+
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+	int pages, struct page **page)
+{
+	struct inode *i = target_page->mapping->host;
+	struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+						 block, bsize);
+	int bytes = buffer->length, res = buffer->error, n, offset = 0;
+	void *pageaddr;
+
+	if (res) {
+		ERROR("Unable to read page, block %llx, size %x\n", block,
+			bsize);
+		goto out;
+	}
+
+	for (n = 0; n < pages && bytes > 0; n++,
+			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+		int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+
+		if (page[n] == NULL)
+			continue;
+
+		pageaddr = kmap_atomic(page[n]);
+		squashfs_copy_data(pageaddr, buffer, offset, avail);
+		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+		kunmap_atomic(pageaddr);
+		flush_dcache_page(page[n]);
+		SetPageUptodate(page[n]);
+		unlock_page(page[n]);
+		if (page[n] != target_page)
+			page_cache_release(page[n]);
+	}
+
+out:
+	squashfs_cache_put(buffer);
+	return res;
+}
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
new file mode 100644
index 000000000000..5a1c11f56441
--- /dev/null
+++ b/fs/squashfs/page_actor.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include "page_actor.h"
+
+/*
+ * This file contains implementations of page_actor for decompressing into
+ * an intermediate buffer, and for decompressing directly into the
+ * page cache.
+ *
+ * Calling code should avoid sleeping between calls to squashfs_first_page()
+ * and squashfs_finish_page().
+ */
+
+/* Implementation of page_actor for decompressing into intermediate buffer */
+static void *cache_first_page(struct squashfs_page_actor *actor)
+{
+	actor->next_page = 1;
+	return actor->buffer[0];
+}
+
+static void *cache_next_page(struct squashfs_page_actor *actor)
+{
+	if (actor->next_page == actor->pages)
+		return NULL;
+
+	return actor->buffer[actor->next_page++];
+}
+
+static void cache_finish_page(struct squashfs_page_actor *actor)
+{
+	/* empty */
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+	int pages, int length)
+{
+	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+	if (actor == NULL)
+		return NULL;
+
+	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->buffer = buffer;
+	actor->pages = pages;
+	actor->next_page = 0;
+	actor->squashfs_first_page = cache_first_page;
+	actor->squashfs_next_page = cache_next_page;
+	actor->squashfs_finish_page = cache_finish_page;
+	return actor;
+}
+
+/* Implementation of page_actor for decompressing directly into page cache. */
+static void *direct_first_page(struct squashfs_page_actor *actor)
+{
+	actor->next_page = 1;
+	return actor->pageaddr = kmap_atomic(actor->page[0]);
+}
+
+static void *direct_next_page(struct squashfs_page_actor *actor)
+{
+	if (actor->pageaddr)
+		kunmap_atomic(actor->pageaddr);
+
+	return actor->pageaddr = actor->next_page == actor->pages ? NULL :
+		kmap_atomic(actor->page[actor->next_page++]);
+}
+
+static void direct_finish_page(struct squashfs_page_actor *actor)
+{
+	if (actor->pageaddr)
+		kunmap_atomic(actor->pageaddr);
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
+	int pages, int length)
+{
+	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+	if (actor == NULL)
+		return NULL;
+
+	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->page = page;
+	actor->pages = pages;
+	actor->next_page = 0;
+	actor->pageaddr = NULL;
+	actor->squashfs_first_page = direct_first_page;
+	actor->squashfs_next_page = direct_next_page;
+	actor->squashfs_finish_page = direct_finish_page;
+	return actor;
+}
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 5b0ba5a7133a..26dd82008b82 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -8,6 +8,7 @@
  * the COPYING file in the top-level directory.
  */
 
+#ifndef CONFIG_SQUASHFS_FILE_DIRECT
 struct squashfs_page_actor {
 	void	**page;
 	int	pages;
@@ -46,4 +47,35 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
 {
 	/* empty */
 }
+#else
+struct squashfs_page_actor {
+	union {
+		void		**buffer;
+		struct page	**page;
+	};
+	void	*pageaddr;
+	void    *(*squashfs_first_page)(struct squashfs_page_actor *);
+	void    *(*squashfs_next_page)(struct squashfs_page_actor *);
+	void    (*squashfs_finish_page)(struct squashfs_page_actor *);
+	int	pages;
+	int	length;
+	int	next_page;
+};
+
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
+							 **, int, int);
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+	return actor->squashfs_first_page(actor);
+}
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+	return actor->squashfs_next_page(actor);
+}
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+	actor->squashfs_finish_page(actor);
+}
+#endif
 #endif
-- 
cgit v1.3


From ed4f381ec15e5f11724cdbc68cffd2c22d1eaebd Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Sun, 10 Nov 2013 00:02:29 +0000
Subject: Squashfs: Check stream is not NULL in decompressor_multi.c

Fix static checker complaint that stream is not checked in
squashfs_decompressor_destroy().

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reviewed-by: Minchan Kim <minchan@kernel.org>
---
 fs/squashfs/decompressor_multi.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
index ae54675a3526..d6008a636479 100644
--- a/fs/squashfs/decompressor_multi.c
+++ b/fs/squashfs/decompressor_multi.c
@@ -119,11 +119,10 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
 			kfree(decomp_strm);
 			stream->avail_decomp--;
 		}
+		WARN_ON(stream->avail_decomp);
+		kfree(stream->comp_opts);
+		kfree(stream);
 	}
-
-	WARN_ON(stream->avail_decomp);
-	kfree(stream->comp_opts);
-	kfree(stream);
 }
 
 
-- 
cgit v1.3


From 4724b106b9b8e8b802ca6f6d8a2f74feb8a3c375 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 5 Nov 2013 11:11:40 -0500
Subject: Btrfs: don't BUG_ON() if we get an error walking backrefs

We can just return false for this so we stop doing the snapshot aware defrag
stuff.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index da8d2f696ac5..fd67b34e220d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2129,7 +2129,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
 						  old->extent_offset, fs_info,
 						  path, record_one_backref,
 						  old);
-		BUG_ON(ret < 0 && ret != -ENOENT);
+		if (ret < 0 && ret != -ENOENT)
+			return false;
 
 		/* no backref to be processed for this extent */
 		if (!old->count) {
-- 
cgit v1.3


From 6cfab851f42edcfa72763b3cb8dc26b8fd74e203 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 12 Nov 2013 16:25:58 -0500
Subject: Btrfs: make sure to copy everything if we rename

If we rename a file that is already in the log and we fsync again we will lose
the new name.  This is because we just log the inode update and not the new ref.
To fix this we just need to check if we are logging the new name of the inode
and copy all the metadata instead of just updating the inode itself.  With this
patch my testcase now passes.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 744553c83fe2..b0c7134162fb 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3697,7 +3697,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			ret = btrfs_truncate_inode_items(trans, log,
 							 inode, 0, 0);
 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-					      &BTRFS_I(inode)->runtime_flags)) {
+					      &BTRFS_I(inode)->runtime_flags) ||
+			   inode_only == LOG_INODE_EXISTS) {
 			if (inode_only == LOG_INODE_ALL)
 				fast_search = true;
 			max_key.type = BTRFS_XATTR_ITEM_KEY;
-- 
cgit v1.3


From d006a04816a306995cd9d394079652a7462f81f8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 12 Nov 2013 20:54:09 -0500
Subject: Btrfs: only drop modified extents if we logged the whole inode

If we fsync, seek and write, rename and then fsync again we will lose the
modified hole extent because the rename will drop all of the modified extents
since we didn't do the fast search.  We need to only drop the modified extents
if we didn't do the fast search and we were logging the entire inode as we don't
need them anymore, otherwise this is being premature.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b0c7134162fb..9f7fc51ca334 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3802,7 +3802,7 @@ log_extents:
 			err = ret;
 			goto out_unlock;
 		}
-	} else {
+	} else if (inode_only == LOG_INODE_ALL) {
 		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
 		struct extent_map *em, *n;
 
-- 
cgit v1.3


From 33ef30add16905f99bbe799ee5ccea15ba497803 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 3 Nov 2013 19:06:38 +0200
Subject: Btrfs: do not inc uncorrectable_errors counter on ro scrubs

Currently if we discover an error when scrubbing in ro mode we a)
blindly increment the uncorrectable_errors counter, and b) spam the
dmesg with the 'unable to fixup (regular) error at ...' message, even
though a) we haven't tried to determine if the error is correctable or
not, and b) we haven't tried to fixup anything.  Fix this.

Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/scrub.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2544805544f0..561e2f16ba3e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -938,8 +938,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sctx->readonly && !sctx->is_dev_replace)
-		goto did_not_correct_error;
+	if (sctx->readonly) {
+		ASSERT(!sctx->is_dev_replace);
+		goto out;
+	}
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
-- 
cgit v1.3


From 908960c6c0fb3b3ce3971dc0ca47b581d256b968 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 3 Nov 2013 19:06:39 +0200
Subject: Btrfs: disable online raid-repair on ro mounts

This disables the "if needed, write the good copy back before the read
is completed" part of the read sequence for read-only mounts.

Cc: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent_io.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 856bc2b2192c..8e457fca0a0b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1980,6 +1980,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	int ret;
 
+	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
 	BUG_ON(!mirror_num);
 
 	/* we can't repair anything in raid56 yet */
@@ -2036,6 +2037,9 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
 	int ret = 0;
 
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
 		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
@@ -2057,12 +2061,12 @@ static int clean_io_failure(u64 start, struct page *page)
 	u64 private;
 	u64 private_failure;
 	struct io_failure_record *failrec;
-	struct btrfs_fs_info *fs_info;
+	struct inode *inode = page->mapping->host;
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct extent_state *state;
 	int num_copies;
 	int did_repair = 0;
 	int ret;
-	struct inode *inode = page->mapping->host;
 
 	private = 0;
 	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
@@ -2085,6 +2089,8 @@ static int clean_io_failure(u64 start, struct page *page)
 		did_repair = 1;
 		goto out;
 	}
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		goto out;
 
 	spin_lock(&BTRFS_I(inode)->io_tree.lock);
 	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
@@ -2094,7 +2100,6 @@ static int clean_io_failure(u64 start, struct page *page)
 
 	if (state && state->start <= failrec->start &&
 	    state->end >= failrec->start + failrec->len - 1) {
-		fs_info = BTRFS_I(inode)->root->fs_info;
 		num_copies = btrfs_num_copies(fs_info, failrec->logical,
 					      failrec->len);
 		if (num_copies > 1)  {
-- 
cgit v1.3


From ba69994a40b242bef26ab2683ea84aa29e2e429f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 3 Nov 2013 19:06:40 +0200
Subject: Btrfs: fix __btrfs_start_workers retval

__btrfs_start_workers returns 0 in case it raced with
btrfs_stop_workers and lost the race.  This is wrong because worker in
this case is not allowed to start and is in fact destroyed.  Return
-EINVAL instead.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/async-thread.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8aec751fa464..c1e0b0caf9cc 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -495,6 +495,7 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
 	spin_lock_irq(&workers->lock);
 	if (workers->stopping) {
 		spin_unlock_irq(&workers->lock);
+		ret = -EINVAL;
 		goto fail_kthread;
 	}
 	list_add_tail(&worker->worker_list, &workers->idle_list);
-- 
cgit v1.3


From d52c1bcc647ca41ee563bc38d8231e5f3816112c Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 5 Nov 2013 11:45:53 +0800
Subject: Btrfs: avoid heavy operations in btrfs_commit_super

The 'git blame' history shows that, the old transaction commit code has to do
twice to ensure roots are updated and we have to flush metadata and super block
manually, however, right now all of these can be handled well inside
the transaction commit code without extra efforts.

And the error handling part remains same with the current code, -- 'return to
caller once we get error'.

This saves us a transaction commit and a flush of super block, which are both
heavy operations according to ftrace output analysis.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4c4ed0bb3da1..8072cfa8a3b1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3517,7 +3517,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 int btrfs_commit_super(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle *trans;
-	int ret;
 
 	mutex_lock(&root->fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(root);
@@ -3531,25 +3530,7 @@ int btrfs_commit_super(struct btrfs_root *root)
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
-	ret = btrfs_commit_transaction(trans, root);
-	if (ret)
-		return ret;
-	/* run commit again to drop the original snapshot */
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
-	ret = btrfs_commit_transaction(trans, root);
-	if (ret)
-		return ret;
-	ret = btrfs_write_and_wait_transaction(NULL, root);
-	if (ret) {
-		btrfs_error(root->fs_info, ret,
-			    "Failed to sync btree inode to disk.");
-		return ret;
-	}
-
-	ret = write_ctree_super(NULL, root, 0);
-	return ret;
+	return btrfs_commit_transaction(trans, root);
 }
 
 int close_ctree(struct btrfs_root *root)
-- 
cgit v1.3


From b1a06a4b574996692b72b742bf6e6aa0c711a948 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 6 Nov 2013 16:57:55 +0800
Subject: Btrfs: fix lockdep error in async commit

Lockdep complains about btrfs's async commit:

[ 2372.462171] [ BUG: bad unlock balance detected! ]
[ 2372.462191] 3.12.0+ #32 Tainted: G        W
[ 2372.462209] -------------------------------------
[ 2372.462228] ceph-osd/14048 is trying to release lock (sb_internal) at:
[ 2372.462275] [<ffffffffa022cb10>] btrfs_commit_transaction_async+0x1b0/0x2a0 [btrfs]
[ 2372.462305] but there are no more locks to release!
[ 2372.462324]
[ 2372.462324] other info that might help us debug this:
[ 2372.462349] no locks held by ceph-osd/14048.
[ 2372.462367]
[ 2372.462367] stack backtrace:
[ 2372.462386] CPU: 2 PID: 14048 Comm: ceph-osd Tainted: G        W    3.12.0+ #32
[ 2372.462414] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015  11/09/2011
[ 2372.462455]  ffffffffa022cb10 ffff88007490fd28 ffffffff816f094a ffff8800378aa320
[ 2372.462491]  ffff88007490fd50 ffffffff810adf4c ffff8800378aa320 ffff88009af97650
[ 2372.462526]  ffffffffa022cb10 ffff88007490fd88 ffffffff810b01ee ffff8800898c0000
[ 2372.462562] Call Trace:
[ 2372.462584]  [<ffffffffa022cb10>] ? btrfs_commit_transaction_async+0x1b0/0x2a0 [btrfs]
[ 2372.462619]  [<ffffffff816f094a>] dump_stack+0x45/0x56
[ 2372.462642]  [<ffffffff810adf4c>] print_unlock_imbalance_bug+0xec/0x100
[ 2372.462677]  [<ffffffffa022cb10>] ? btrfs_commit_transaction_async+0x1b0/0x2a0 [btrfs]
[ 2372.462710]  [<ffffffff810b01ee>] lock_release+0x18e/0x210
[ 2372.462742]  [<ffffffffa022cb36>] btrfs_commit_transaction_async+0x1d6/0x2a0 [btrfs]
[ 2372.462783]  [<ffffffffa025a7ce>] btrfs_ioctl_start_sync+0x3e/0xc0 [btrfs]
[ 2372.462822]  [<ffffffffa025f1d3>] btrfs_ioctl+0x4c3/0x1f70 [btrfs]
[ 2372.462849]  [<ffffffff812c0321>] ? avc_has_perm+0x121/0x1b0
[ 2372.462873]  [<ffffffff812c0224>] ? avc_has_perm+0x24/0x1b0
[ 2372.462897]  [<ffffffff8107ecc8>] ? sched_clock_cpu+0xa8/0x100
[ 2372.462922]  [<ffffffff8117b145>] do_vfs_ioctl+0x2e5/0x4e0
[ 2372.462946]  [<ffffffff812c19e6>] ? file_has_perm+0x86/0xa0
[ 2372.462969]  [<ffffffff8117b3c1>] SyS_ioctl+0x81/0xa0
[ 2372.462991]  [<ffffffff817045a4>] tracesys+0xdd/0xe2

====================================================

It's because that we don't do the right thing when checking if it's ok to
tell lockdep that we're trying to release the rwsem.

If the trans handle's type is TRANS_ATTACH, we won't acquire the freeze rwsem, but
as TRANS_ATTACH fits the check (trans < TRANS_JOIN_NOLOCK), we'll release the freeze
rwsem, which makes lockdep complains a lot.

Reported-by: Ma Jianpeng <majianpeng@gmail.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/transaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 57c16b46afbd..c6a872a8a468 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1480,7 +1480,7 @@ static void do_async_commit(struct work_struct *work)
 	 * We've got freeze protection passed with the transaction.
 	 * Tell lockdep about it.
 	 */
-	if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
+	if (ac->newtrans->type & __TRANS_FREEZABLE)
 		rwsem_acquire_read(
 		     &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
 		     0, 1, _THIS_IP_);
@@ -1521,7 +1521,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	 * Tell lockdep we've released the freeze rwsem, since the
 	 * async commit thread will be the one to unlock it.
 	 */
-	if (trans->type < TRANS_JOIN_NOLOCK)
+	if (ac->newtrans->type & __TRANS_FREEZABLE)
 		rwsem_release(
 			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
 			1, _THIS_IP_);
-- 
cgit v1.3


From b52abf1e3b65b4fbd42b39412e040b561faec6a6 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Wed, 6 Nov 2013 15:12:40 +0000
Subject: Btrfs: don't wait for ordered data outside desired range

In btrfs_wait_ordered_range(), if we found an extent to the left
of the start of our desired wait range and the last byte of that
extent is 1 less than the desired range's start, we would would
wait for the IO completion of that extent unnecessarily.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ordered-data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 25a8f3812f14..c37124bccf17 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -803,7 +803,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
-		if (ordered->file_offset + ordered->len < start) {
+		if (ordered->file_offset + ordered->len <= start) {
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
-- 
cgit v1.3


From 9650e05c071fc92e704d4359d59e3ac3ecae2875 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Tue, 12 Nov 2013 19:32:04 +0800
Subject: Btrfs: remove dead codes from ctree.h

These two functions are only stated but undefined.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9aeb2759a64..54ab86127f7a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3613,9 +3613,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio, u64 file_start, int contig);
-int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, struct btrfs_path *path,
-			u64 isize);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
 /* inode.c */
@@ -3744,9 +3741,6 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			     int skip_pinned);
-int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
-			       u64 start, u64 end, int skip_pinned,
-			       int modified);
 extern const struct file_operations btrfs_file_operations;
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
-- 
cgit v1.3


From 56d140f5f61d3547dcc194de9cd5193b957b8016 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Wed, 13 Nov 2013 17:19:08 +0100
Subject: Btrfs: print bytenr instead of page pointer in check-int

The page pointer information was useless. The bytenr is what you
want when you search for submitted write bios.

Additionally, a new bit in the print mask is added that allows
to selectively enable the check-int submit_bio verbose mode. Before,
the global verbose mode had to be enabled leading to many million
useless lines in the kernel log.

And a comment is added that explains that LOG_BUF_SHIFT needs to
be set to a really high value.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/check-integrity.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index e0aab4456974..b50764bef141 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -77,6 +77,15 @@
  * the integrity of (super)-block write requests, do not
  * enable the config option BTRFS_FS_CHECK_INTEGRITY to
  * include and compile the integrity check tool.
+ *
+ * Expect millions of lines of information in the kernel log with an
+ * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
+ * kernel config to at least 26 (which is 64MB). Usually the value is
+ * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
+ * changed like this before LOG_BUF_SHIFT can be set to a high value:
+ * config LOG_BUF_SHIFT
+ *       int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
+ *       range 12 30
  */
 
 #include <linux/sched.h>
@@ -124,6 +133,7 @@
 #define BTRFSIC_PRINT_MASK_INITIAL_DATABASE			0x00000400
 #define BTRFSIC_PRINT_MASK_NUM_COPIES				0x00000800
 #define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS		0x00001000
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE		0x00002000
 
 struct btrfsic_dev_state;
 struct btrfsic_state;
@@ -3015,6 +3025,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
 	    (rw & WRITE) && NULL != bio->bi_io_vec) {
 		unsigned int i;
 		u64 dev_bytenr;
+		u64 cur_bytenr;
 		int bio_is_patched;
 		char **mapped_datav;
 
@@ -3033,6 +3044,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
 				       GFP_NOFS);
 		if (!mapped_datav)
 			goto leave;
+		cur_bytenr = dev_bytenr;
 		for (i = 0; i < bio->bi_vcnt; i++) {
 			BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
 			mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
@@ -3044,16 +3056,13 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
 				kfree(mapped_datav);
 				goto leave;
 			}
-			if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
-			     BTRFSIC_PRINT_MASK_VERBOSE) ==
-			    (dev_state->state->print_mask &
-			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
-			      BTRFSIC_PRINT_MASK_VERBOSE)))
+			if (dev_state->state->print_mask &
+			    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
 				printk(KERN_INFO
-				       "#%u: page=%p, len=%u, offset=%u\n",
-				       i, bio->bi_io_vec[i].bv_page,
-				       bio->bi_io_vec[i].bv_len,
+				       "#%u: bytenr=%llu, len=%u, offset=%u\n",
+				       i, cur_bytenr, bio->bi_io_vec[i].bv_len,
 				       bio->bi_io_vec[i].bv_offset);
+			cur_bytenr += bio->bi_io_vec[i].bv_len;
 		}
 		btrfsic_process_written_block(dev_state, dev_bytenr,
 					      mapped_datav, bio->bi_vcnt,
-- 
cgit v1.3


From 931aa87791af46640a46b11fa503a119e36943ec Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 14 Nov 2013 17:33:21 +0800
Subject: Btrfs: fix list delete warning when removing ordered root from the
 list

Commit b02441999efcc6152b87cd58e7970bb7843f76cf "Btrfs: don't wait for
the completion of all the ordered extents" introduced a bug that broke
the ordered root list:
 WARNING: CPU: 1 PID: 7119 at lib/list_debug.c:59 __list_del_entry+0x5a/0x98()

It is because we forgot to return the roots in the splice list to the
ordered list of the fs. Fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ordered-data.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c37124bccf17..69582d5b69d1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -638,6 +638,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 			WARN_ON(nr < 0);
 		}
 	}
+	list_splice_tail(&splice, &fs_info->ordered_roots);
 	spin_unlock(&fs_info->ordered_root_lock);
 }
 
-- 
cgit v1.3


From 52a157592140ac76d9f15637543aecc2100f95cb Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Thu, 14 Nov 2013 19:52:44 +0800
Subject: btrfs: fix typo in the log message

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dev-replace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 342f9fd411e3..2cfc3dfff64f 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -366,7 +366,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	dev_replace->tgtdev = tgt_device;
 
 	printk_in_rcu(KERN_INFO
-		      "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
+		      "btrfs: dev_replace from %s (devid %llu) to %s started\n",
 		      src_device->missing ? "<missing disk>" :
 		        rcu_str_deref(src_device->name),
 		      src_device->devid,
-- 
cgit v1.3


From 4cd8587ce8fb79e49d1d6d1fc065f056188fb86a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Nov 2013 22:57:29 -0500
Subject: btrfs: Use trace condition for get_extent tracepoint

Doing an if statement to test some condition to know if we should
trigger a tracepoint is pointless when tracing is disabled. This just
adds overhead and wastes a branch prediction. This is why the
TRACE_EVENT_CONDITION() was created. It places the check inside the jump
label so that the branch does not happen unless tracing is enabled.

That is, instead of doing:

	if (em)
		trace_btrfs_get_extent(root, em);

Which is basically this:

	if (em)
		if (static_key(trace_btrfs_get_extent)) {

Using a TRACE_EVENT_CONDITION() we can just do:

	trace_btrfs_get_extent(root, em);

And the condition trace event will do:

	if (static_key(trace_btrfs_get_extent)) {
		if (em) {
			...

The static key is a non conditional jump (or nop) that is faster than
having to check if em is NULL or not.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c             | 3 +--
 include/trace/events/btrfs.h | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fd67b34e220d..f1a77449d032 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6187,8 +6187,7 @@ insert:
 	write_unlock(&em_tree->lock);
 out:
 
-	if (em)
-		trace_btrfs_get_extent(root, em);
+	trace_btrfs_get_extent(root, em);
 
 	if (path)
 		btrfs_free_path(path);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index f18b3b76e01e..4832d75dcbae 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -162,12 +162,14 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
 		{ EXTENT_FLAG_LOGGING,	 	"LOGGING" 	},	\
 		{ EXTENT_FLAG_FILLING,	 	"FILLING" 	})
 
-TRACE_EVENT(btrfs_get_extent,
+TRACE_EVENT_CONDITION(btrfs_get_extent,
 
 	TP_PROTO(struct btrfs_root *root, struct extent_map *map),
 
 	TP_ARGS(root, map),
 
+	TP_CONDITION(map),
+
 	TP_STRUCT__entry(
 		__field(	u64,  root_objectid	)
 		__field(	u64,  start		)
-- 
cgit v1.3


From 475bf36ffba2ad18631865a2429468dc6c7ea919 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 18 Nov 2013 22:13:18 +0900
Subject: btrfs: fix bio_size_ok() for max_sectors > 0xffff

The data type of max_sectors in queue settings is unsigned int.  But
this value is stored to the local variable whose type is unsigned short
in bio_size_ok().  This can cause unexpected result when max_sectors >
0xffff.

Cc: Chris Mason <chris.mason@fusionio.com>
Cc: linux-btrfs@vger.kernel.org
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0db637097862..92303f42baaa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5394,7 +5394,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
 {
 	struct bio_vec *prev;
 	struct request_queue *q = bdev_get_queue(bdev);
-	unsigned short max_sectors = queue_max_sectors(q);
+	unsigned int max_sectors = queue_max_sectors(q);
 	struct bvec_merge_data bvm = {
 		.bi_bdev = bdev,
 		.bi_sector = sector,
-- 
cgit v1.3


From 4204617d142c0887e45fda2562cb5c58097b918e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Wed, 20 Nov 2013 14:32:34 +0100
Subject: btrfs: update kconfig help text

Reflect the current status. Portions of the text taken from the
wiki pages.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Kconfig | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 398cbd517be2..f62389348662 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -9,12 +9,17 @@ config BTRFS_FS
 	select XOR_BLOCKS
 
 	help
-	  Btrfs is a new filesystem with extents, writable snapshotting,
-	  support for multiple devices and many more features.
+	  Btrfs is a general purpose copy-on-write filesystem with extents,
+	  writable snapshotting, support for multiple devices and many more
+	  features focused on fault tolerance, repair and easy administration.
 
-	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
-	  FINALIZED.  You should say N here unless you are interested in
-	  testing Btrfs with non-critical data.
+	  The filesystem disk format is no longer unstable, and it's not
+	  expected to change unless there are strong reasons to do so. If there
+	  is a format change, file systems with a unchanged format will
+	  continue to be mountable and usable by newer kernels.
+
+	  For more information, please see the web pages at
+	  http://btrfs.wiki.kernel.org.
 
 	  To compile this file system support as a module, choose M here. The
 	  module will be called btrfs.
-- 
cgit v1.3


From e3c4269d139db7471ee560fb30948179e180b3d4 Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <mina86@mina86.com>
Date: Tue, 12 Nov 2013 13:30:05 +0100
Subject: GFS2: fix potential NULL pointer dereference

Commit [e66cf1610: GFS2: Use lockref for glocks] replaced call:
    atomic_read(&gi->gl->gl_ref) == 0
with:
    __lockref_is_dead(&gl->gl_lockref)
therefore changing how gl is accessed, from gi->gl to plan gl.
However, gl can be a NULL pointer, and so gi->gl needs to be
used instead (which is guaranteed not to be NULL because fo
the while loop checking that condition).

Signed-off-by: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e66a8009aff1..c8420f7e4db6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1899,7 +1899,8 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 			gi->nhash = 0;
 		}
 	/* Skip entries for other sb and dead entries */
-	} while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref));
+	} while (gi->sdp != gi->gl->gl_sbd ||
+		 __lockref_is_dead(&gi->gl->gl_lockref));
 
 	return 0;
 }
-- 
cgit v1.3


From ea0341e071527d5cec350917b01ab901af09d758 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 21 Nov 2013 18:47:57 +0000
Subject: GFS2: Fix ref count bug relating to atomic_open

In the case that atomic_open calls finish_no_open() with
the dentry that was supplied to gfs2_atomic_open() an
extra reference count is required. This patch fixes that
issue preventing a bug trap triggering at umount time.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 1615df16cf4e..7119504159f1 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1171,8 +1171,11 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
 	if (d != NULL)
 		dentry = d;
 	if (dentry->d_inode) {
-		if (!(*opened & FILE_OPENED))
+		if (!(*opened & FILE_OPENED)) {
+			if (d == NULL)
+				dget(dentry);
 			return finish_no_open(file, dentry);
+		}
 		dput(d);
 		return 0;
 	}
-- 
cgit v1.3


From 76ae281f6307331aa063288edb6422ae99f435f0 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Thu, 21 Nov 2013 14:31:56 -0800
Subject: configfs: fix race between dentry put and lookup

A race window in configfs, it starts from one dentry is UNHASHED and end
before configfs_d_iput is called.  In this window, if a lookup happen,
since the original dentry was UNHASHED, so a new dentry will be
allocated, and then in configfs_attach_attr(), sd->s_dentry will be
updated to the new dentry.  Then in configfs_d_iput(),
BUG_ON(sd->s_dentry != dentry) will be triggered and system panic.

sys_open:                     sys_close:
 ...                           fput
                                dput
                                 dentry_kill
                                  __d_drop <--- dentry unhashed here,
                                           but sd->dentry still point
                                           to this dentry.

 lookup_real
  configfs_lookup
   configfs_attach_attr---> update sd->s_dentry
                            to new allocated dentry here.

                                   d_kill
                                     configfs_d_iput <--- BUG_ON(sd->s_dentry != dentry)
                                                     triggered here.

To fix it, change configfs_d_iput to not update sd->s_dentry if
sd->s_count > 2, that means there are another dentry is using the sd
beside the one that is going to be put.  Use configfs_dirent_lock in
configfs_attach_attr to sync with configfs_d_iput.

With the following steps, you can reproduce the bug.

1. enable ocfs2, this will mount configfs at /sys/kernel/config and
   fill configure in it.

2. run the following script.
	while [ 1 ]; do cat /sys/kernel/config/cluster/$your_cluster_name/idle_timeout_ms > /dev/null; done &
	while [ 1 ]; do cat /sys/kernel/config/cluster/$your_cluster_name/idle_timeout_ms > /dev/null; done &

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/configfs/dir.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 4522e0755773..e081acbac2e7 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -56,10 +56,19 @@ static void configfs_d_iput(struct dentry * dentry,
 	struct configfs_dirent *sd = dentry->d_fsdata;
 
 	if (sd) {
-		BUG_ON(sd->s_dentry != dentry);
 		/* Coordinate with configfs_readdir */
 		spin_lock(&configfs_dirent_lock);
-		sd->s_dentry = NULL;
+		/* Coordinate with configfs_attach_attr where will increase
+		 * sd->s_count and update sd->s_dentry to new allocated one.
+		 * Only set sd->dentry to null when this dentry is the only
+		 * sd owner.
+		 * If not do so, configfs_d_iput may run just after
+		 * configfs_attach_attr and set sd->s_dentry to null
+		 * even it's still in use.
+		 */
+		if (atomic_read(&sd->s_count) <= 2)
+			sd->s_dentry = NULL;
+
 		spin_unlock(&configfs_dirent_lock);
 		configfs_put(sd);
 	}
@@ -416,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
 	struct configfs_attribute * attr = sd->s_element;
 	int error;
 
+	spin_lock(&configfs_dirent_lock);
 	dentry->d_fsdata = configfs_get(sd);
 	sd->s_dentry = dentry;
+	spin_unlock(&configfs_dirent_lock);
+
 	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
 				configfs_init_file);
 	if (error) {
-- 
cgit v1.3