author    Andrew Morton <akpm@digeo.com>    2003-06-17 18:26:42 -0700
committer Linus Torvalds <torvalds@home.transmeta.com>    2003-06-17 18:26:42 -0700
commit    c12b9866ea52183ad5f2d87ca01c09b3a7508f5a (patch)
tree      14c7df6b50aed0a4c41a18110113b6442d9642de
parent    78f2f471204076997e5360dd7ffe807b9eb1e5ce (diff)
[PATCH] ext3: concurrent block/inode allocation
From: Alex Tomas <bzzz@tmi.comex.ru>

This patch weans ext3 off lock_super()-based protection for the inode and
block allocators.  It's basically the same as the ext2 changes.

1) each group has its own spinlock, which is used for group counter
   modifications
2) sb->s_free_blocks_count isn't used any more.  ext3_statfs() and
   find_group_orlov() loop over the groups to count free blocks
3) sb->s_free_blocks_count is recalculated at mount/umount/sync_super time
   in order to check consistency and to avoid fsck warnings
4) reserved blocks are distributed over the last groups
5) ext3_new_block() tries to use non-reserved blocks and, if that fails,
   falls back to the reserved blocks
6) ext3_new_block() and ext3_free_blocks() do not modify
   sb->s_free_blocks, therefore they do not call mark_buffer_dirty() for
   the superblock's buffer_head.  This should reduce I/O a bit.

Also fix an Orlov allocator boundary case: in the interests of SMP
scalability the ext2 free blocks and free inodes counters are
"approximate", but a piece of code in the Orlov allocator fails due to
boundary conditions on really small filesystems.  Fix that up via a final
allocation pass which simply uses first-fit for allocation of a directory
inode.
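To make the locking scheme in points 1 and 2 concrete, here is a minimal
userspace C sketch, not the kernel code: pthread spinlocks stand in for the
kernel's spinlock_t, and every name in it is illustrative rather than taken
from the patch.  It shows one lock per block group guarding that group's
free-blocks counter, with the filesystem-wide total recomputed by summing
the groups instead of being maintained under a global lock.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct bg_info {                      /* stand-in for ext3_bg_info */
	pthread_spinlock_t lock;      /* analogue of the per-group balloc lock */
	long free_blocks;             /* analogue of gdp->bg_free_blocks_count */
};

static struct bg_info *groups;
static int ngroups;

/* Allocation and free paths take only the affected group's lock. */
static void group_add_free_blocks(int group, long delta)
{
	pthread_spin_lock(&groups[group].lock);
	groups[group].free_blocks += delta;
	pthread_spin_unlock(&groups[group].lock);
}

/*
 * statfs-style total: sum the per-group counters instead of keeping a
 * global counter coherent.  As in the patch, the reader takes no locks,
 * so the result is approximate under concurrent updates.
 */
static long count_free_blocks(void)
{
	long total = 0;
	for (int i = 0; i < ngroups; i++)
		total += groups[i].free_blocks;
	return total;
}

int main(void)
{
	ngroups = 8;
	groups = calloc(ngroups, sizeof(*groups));
	for (int i = 0; i < ngroups; i++) {
		pthread_spin_init(&groups[i].lock, PTHREAD_PROCESS_PRIVATE);
		groups[i].free_blocks = 100;
	}
	group_add_free_blocks(3, -1);    /* "allocate" one block in group 3 */
	printf("free blocks: %ld\n", count_free_blocks());
	free(groups);
	return 0;
}

The shape matters more than the details: allocations in different groups
never contend on a lock, and nothing dirties the superblock's buffer_head
per allocation; in the patch the on-disk totals are re-synchronized only in
ext3_commit_super() at mount/umount/sync time.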
-rw-r--r--  fs/ext3/balloc.c           | 298
-rw-r--r--  fs/ext3/ialloc.c           |  79
-rw-r--r--  fs/ext3/super.c            |  59
-rw-r--r--  fs/jbd/journal.c           |   2
-rw-r--r--  fs/jbd/transaction.c       |   2
-rw-r--r--  include/linux/ext3_fs.h    |   2
-rw-r--r--  include/linux/ext3_fs_sb.h |  10
-rw-r--r--  include/linux/ext3_jbd.h   |   6
8 files changed, 280 insertions(+), 178 deletions(-)
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index e0a02e094047..c792e232a493 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -118,7 +118,6 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
printk ("ext3_free_blocks: nonexistent device");
return;
}
- lock_super (sb);
es = EXT3_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
block + count < block ||
@@ -184,11 +183,6 @@ do_more:
if (err)
goto error_return;
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
- err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
- if (err)
- goto error_return;
-
for (i = 0; i < count; i++) {
/*
* An HJ special. This is expensive...
@@ -207,19 +201,6 @@ do_more:
}
}
#endif
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
- ext3_error (sb, __FUNCTION__,
- "bit already cleared for block %lu",
- block + i);
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- dquot_freed_blocks++;
- gdp->bg_free_blocks_count =
- cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
- es->s_free_blocks_count =
- cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
- }
/* @@@ This prevents newly-allocated data from being
* freed and then reallocated within the same
* transaction.
@@ -238,12 +219,35 @@ do_more:
* activity on the buffer any more and so it is safe to
* reallocate it.
*/
- BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
+ BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
J_ASSERT_BH(bitmap_bh,
bh2jh(bitmap_bh)->b_committed_data != NULL);
- ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
+ ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
+ bh2jh(bitmap_bh)->b_committed_data);
+
+ /*
+ * We clear the bit in the bitmap after setting the committed
+ * data bit, because this is the reverse order to that which
+ * the allocator uses.
+ */
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ bit + i, bitmap_bh->b_data)) {
+ ext3_error (sb, __FUNCTION__,
+ "bit already cleared for block %lu",
+ block + i);
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ dquot_freed_blocks++;
+ }
}
+ spin_lock(bg_lock(sb, block_group));
+ gdp->bg_free_blocks_count =
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) +
+ dquot_freed_blocks);
+ spin_unlock(bg_lock(sb, block_group));
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext3_journal_dirty_metadata(handle, bitmap_bh);
@@ -253,11 +257,6 @@ do_more:
ret = ext3_journal_dirty_metadata(handle, gd_bh);
if (!err) err = ret;
- /* And the superblock */
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "dirtied superblock");
- ret = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- if (!err) err = ret;
-
if (overflow && !err) {
block += count;
count = overflow;
@@ -267,7 +266,6 @@ do_more:
error_return:
brelse(bitmap_bh);
ext3_std_error(sb, err);
- unlock_super(sb);
if (dquot_freed_blocks)
DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
return;
@@ -368,6 +366,98 @@ static int find_next_usable_block(int start,
}
/*
+ * We think we can allocate this block in this bitmap. Try to set the bit.
+ * If that succeeds then check that nobody has allocated and then freed the
+ * block since we saw that it was not marked in b_committed_data. If it _was_
+ * allocated and freed then clear the bit in the bitmap again and return
+ * zero (failure).
+ */
+static inline int
+claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+{
+ if (ext3_set_bit_atomic(lock, block, bh->b_data))
+ return 0;
+ if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data &&
+ ext3_test_bit(block, bh2jh(bh)->b_committed_data)) {
+ ext3_clear_bit_atomic(lock, block, bh->b_data);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * If we failed to allocate the desired block then we may end up crossing to a
+ * new bitmap. In that case we must release write access to the old one via
+ * ext3_journal_release_buffer(), else we'll run out of credits.
+ */
+static int
+ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
+ struct buffer_head *bitmap_bh, int goal, int *errp)
+{
+ int i, fatal = 0;
+ int have_access = 0;
+
+ *errp = 0;
+
+ if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
+ goto got;
+
+repeat:
+ goal = find_next_usable_block(goal, bitmap_bh,
+ EXT3_BLOCKS_PER_GROUP(sb));
+ if (goal < 0)
+ goto fail;
+
+ for (i = 0;
+ i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
+ i++, goal--);
+
+got:
+ if (!have_access) {
+ /*
+ * Make sure we use undo access for the bitmap, because it is
+ * critical that we do the frozen_data COW on bitmap buffers in
+ * all cases even if the buffer is in BJ_Forget state in the
+ * committing transaction.
+ */
+ BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+ fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+ if (fatal) {
+ *errp = fatal;
+ goto fail;
+ }
+ have_access = 1;
+ }
+
+ if (!claim_block(bg_lock(sb, group), goal, bitmap_bh)) {
+ /*
+ * The block was allocated by another thread, or it was
+ * allocated and then freed by another thread
+ */
+ goal++;
+ if (goal >= EXT3_BLOCKS_PER_GROUP(sb))
+ goto fail;
+ goto repeat;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+ fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
+ if (fatal) {
+ *errp = fatal;
+ goto fail;
+ }
+
+ return goal;
+fail:
+ if (have_access) {
+ BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+ ext3_journal_release_buffer(handle, bitmap_bh);
+ }
+ return -1;
+}
+
+
+/*
* ext3_new_block uses a goal block to assist allocation. If the goal is
* free, or there is a free block within 32 blocks of the goal, that block
* is allocated. Otherwise a forward search is made for a free block; within
@@ -383,10 +473,12 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
struct buffer_head *gdp_bh; /* bh2 */
int group_no; /* i */
int ret_block; /* j */
- int bit; /* k */
+ int bgi; /* blockgroup iteration index */
int target_block; /* tmp */
int fatal = 0, err;
int performed_allocation = 0;
+ int free;
+ int use_reserve = 0;
struct super_block *sb;
struct ext3_group_desc *gdp;
struct ext3_super_block *es;
@@ -408,16 +500,7 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
return 0;
}
- lock_super(sb);
es = EXT3_SB(sb)->s_es;
- if (le32_to_cpu(es->s_free_blocks_count) <=
- le32_to_cpu(es->s_r_blocks_count) &&
- ((EXT3_SB(sb)->s_resuid != current->fsuid) &&
- (EXT3_SB(sb)->s_resgid == 0 ||
- !in_group_p(EXT3_SB(sb)->s_resgid)) &&
- !capable(CAP_SYS_RESOURCE)))
- goto out;
-
ext3_debug("goal=%lu.\n", goal);
/*
@@ -432,40 +515,28 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
if (!gdp)
goto io_error;
- if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+ free = le16_to_cpu(gdp->bg_free_blocks_count);
+ free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
+ if (free > 0) {
ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
EXT3_BLOCKS_PER_GROUP(sb));
-#ifdef EXT3FS_DEBUG
- if (ret_block)
- goal_attempts++;
-#endif
bitmap_bh = read_block_bitmap(sb, group_no);
if (!bitmap_bh)
- goto io_error;
-
- ext3_debug("goal is at %d:%d.\n", group_no, ret_block);
-
- if (ext3_test_allocatable(ret_block, bitmap_bh)) {
-#ifdef EXT3FS_DEBUG
- goal_hits++;
- ext3_debug("goal bit allocated.\n");
-#endif
- goto got_block;
- }
-
- ret_block = find_next_usable_block(ret_block, bitmap_bh,
- EXT3_BLOCKS_PER_GROUP(sb));
+ goto io_error;
+ ret_block = ext3_try_to_allocate(sb, handle, group_no,
+ bitmap_bh, ret_block, &fatal);
+ if (fatal)
+ goto out;
if (ret_block >= 0)
- goto search_back;
+ goto allocated;
}
-
- ext3_debug("Bit not found in block group %d.\n", group_no);
-
+
/*
* Now search the rest of the groups. We assume that
* i and gdp correctly point to the last group visited.
*/
- for (bit = 0; bit < EXT3_SB(sb)->s_groups_count; bit++) {
+repeat:
+ for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) {
group_no++;
if (group_no >= EXT3_SB(sb)->s_groups_count)
group_no = 0;
@@ -474,57 +545,47 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
*errp = -EIO;
goto out;
}
- if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
- brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
- goto io_error;
- ret_block = find_next_usable_block(-1, bitmap_bh,
- EXT3_BLOCKS_PER_GROUP(sb));
- if (ret_block >= 0)
- goto search_back;
- }
+ free = le16_to_cpu(gdp->bg_free_blocks_count);
+ if (!use_reserve)
+ free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
+ if (free <= 0)
+ continue;
+
+ brelse(bitmap_bh);
+ bitmap_bh = read_block_bitmap(sb, group_no);
+ if (!bitmap_bh)
+ goto io_error;
+ ret_block = ext3_try_to_allocate(sb, handle, group_no,
+ bitmap_bh, -1, &fatal);
+ if (fatal)
+ goto out;
+ if (ret_block >= 0)
+ goto allocated;
+ }
+
+ if (!use_reserve &&
+ (EXT3_SB(sb)->s_resuid == current->fsuid ||
+ (EXT3_SB(sb)->s_resgid != 0 && in_group_p(EXT3_SB(sb)->s_resgid)) ||
+ capable(CAP_SYS_RESOURCE))) {
+ use_reserve = 1;
+ group_no = 0;
+ goto repeat;
}
/* No space left on the device */
+ *errp = -ENOSPC;
goto out;
-search_back:
- /*
- * We have succeeded in finding a free byte in the block
- * bitmap. Now search backwards up to 7 bits to find the
- * start of this group of free blocks.
- */
- for ( bit = 0;
- bit < 7 && ret_block > 0 &&
- ext3_test_allocatable(ret_block - 1, bitmap_bh);
- bit++, ret_block--)
- ;
-
-got_block:
+allocated:
ext3_debug("using block group %d(%d)\n",
group_no, gdp->bg_free_blocks_count);
- /* Make sure we use undo access for the bitmap, because it is
- critical that we do the frozen_data COW on bitmap buffers in
- all cases even if the buffer is in BJ_Forget state in the
- committing transaction. */
- BUFFER_TRACE(bitmap_bh, "get undo access for marking new block");
- fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
- if (fatal)
- goto out;
-
BUFFER_TRACE(gdp_bh, "get_write_access");
fatal = ext3_journal_get_write_access(handle, gdp_bh);
if (fatal)
goto out;
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
- fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
- if (fatal)
- goto out;
-
target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
+ le32_to_cpu(es->s_first_data_block);
@@ -536,11 +597,6 @@ got_block:
"Allocating block in system zone - "
"block = %u", target_block);
- /* The superblock lock should guard against anybody else beating
- * us to this point! */
- J_ASSERT_BH(bitmap_bh, !ext3_test_bit(ret_block, bitmap_bh->b_data));
- BUFFER_TRACE(bitmap_bh, "setting bitmap bit");
- ext3_set_bit(ret_block, bitmap_bh->b_data);
performed_allocation = 1;
#ifdef CONFIG_JBD_DEBUG
@@ -556,20 +612,17 @@ got_block:
}
}
#endif
+ spin_lock(bg_lock(sb, group_no));
if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
J_ASSERT_BH(bitmap_bh,
!ext3_test_bit(ret_block,
bh2jh(bitmap_bh)->b_committed_data));
ext3_debug("found bit %d\n", ret_block);
+ spin_unlock(bg_lock(sb, group_no));
/* ret_block was blockgroup-relative. Now it becomes fs-relative */
ret_block = target_block;
- BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
- err = ext3_journal_dirty_metadata(handle, bitmap_bh);
- if (!fatal)
- fatal = err;
-
if (ret_block >= le32_to_cpu(es->s_blocks_count)) {
ext3_error(sb, "ext3_new_block",
"block(%d) >= blocks count(%d) - "
@@ -586,27 +639,20 @@ got_block:
ext3_debug("allocating block %d. Goal hits %d of %d.\n",
ret_block, goal_hits, goal_attempts);
+ spin_lock(bg_lock(sb, group_no));
gdp->bg_free_blocks_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
- es->s_free_blocks_count =
- cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1);
+ spin_unlock(bg_lock(sb, group_no));
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext3_journal_dirty_metadata(handle, gdp_bh);
if (!fatal)
fatal = err;
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
- "journal_dirty_metadata for superblock");
- err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- if (!fatal)
- fatal = err;
-
sb->s_dirt = 1;
if (fatal)
goto out;
- unlock_super(sb);
*errp = 0;
brelse(bitmap_bh);
return ret_block;
@@ -618,7 +664,6 @@ out:
*errp = fatal;
ext3_std_error(sb, fatal);
}
- unlock_super(sb);
/*
* Undo the block allocation
*/
@@ -631,12 +676,13 @@ out:
unsigned long ext3_count_free_blocks(struct super_block *sb)
{
+ unsigned long desc_count;
+ struct ext3_group_desc *gdp;
+ int i;
#ifdef EXT3FS_DEBUG
struct ext3_super_block *es;
- unsigned long desc_count, bitmap_count, x;
+ unsigned long bitmap_count, x;
struct buffer_head *bitmap_bh = NULL;
- struct ext3_group_desc *gdp;
- int i;
lock_super(sb);
es = EXT3_SB(sb)->s_es;
@@ -664,7 +710,15 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
unlock_super(sb);
return bitmap_count;
#else
- return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count);
+ desc_count = 0;
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+ gdp = ext3_get_group_desc(sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return desc_count;
#endif
}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 155c19c4ac92..ab12862053d7 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -131,7 +131,6 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
/* Do this BEFORE marking the inode not in use or returning an error */
clear_inode (inode);
- lock_super (sb);
es = EXT3_SB(sb)->s_es;
if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
ext3_error (sb, "ext3_free_inode",
@@ -150,7 +149,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
goto error_return;
/* Ok, now we can actually update the inode bitmaps.. */
- if (!ext3_clear_bit(bit, bitmap_bh->b_data))
+ if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ bit, bitmap_bh->b_data))
ext3_error (sb, "ext3_free_inode",
"bit already cleared for inode %lu", ino);
else {
@@ -160,28 +160,18 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
fatal = ext3_journal_get_write_access(handle, bh2);
if (fatal) goto error_return;
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get write access");
- fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
- if (fatal) goto error_return;
-
if (gdp) {
+ spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
gdp->bg_free_inodes_count = cpu_to_le16(
le16_to_cpu(gdp->bg_free_inodes_count) + 1);
- if (is_directory) {
+ if (is_directory)
gdp->bg_used_dirs_count = cpu_to_le16(
le16_to_cpu(gdp->bg_used_dirs_count) - 1);
- EXT3_SB(sb)->s_dir_count--;
- }
+ spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
}
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bh2);
if (!fatal) fatal = err;
- es->s_free_inodes_count =
- cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
- "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- if (!fatal) fatal = err;
}
BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bitmap_bh);
@@ -191,7 +181,6 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
error_return:
brelse(bitmap_bh);
ext3_std_error(sb, fatal);
- unlock_super(sb);
}
/*
@@ -206,9 +195,8 @@ error_return:
*/
static int find_group_dir(struct super_block *sb, struct inode *parent)
{
- struct ext3_super_block * es = EXT3_SB(sb)->s_es;
int ngroups = EXT3_SB(sb)->s_groups_count;
- int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
+ int avefreei = ext3_count_free_inodes(sb) / ngroups;
struct ext3_group_desc *desc, *best_desc = NULL;
struct buffer_head *bh;
int group, best_group = -1;
@@ -264,10 +252,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
struct ext3_super_block *es = sbi->s_es;
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
- int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
- int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+ int freei = ext3_count_free_inodes(sb);
+ int avefreei = freei / ngroups;
+ int freeb = ext3_count_free_blocks(sb);
+ int avefreeb = freeb / ngroups;
int blocks_per_dir;
- int ndirs = sbi->s_dir_count;
+ int ndirs = ext3_count_dirs(sb);
int max_debt, max_dirs, min_blocks, min_inodes;
int group = -1, i;
struct ext3_group_desc *desc;
@@ -319,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
desc = ext3_get_group_desc (sb, group, &bh);
if (!desc || !desc->bg_free_inodes_count)
continue;
- if (sbi->s_debts[group] >= max_debt)
+ if (sbi->s_bgi[group].bg_debts >= max_debt)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
continue;
@@ -340,6 +330,15 @@ fallback:
return group;
}
+ if (avefreei) {
+ /*
+ * The free-inodes counter is approximate, and for really small
+ * filesystems the above test can fail to find any blockgroups
+ */
+ avefreei = 0;
+ goto fallback;
+ }
+
return -1;
}
@@ -435,7 +434,6 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
return ERR_PTR(-ENOMEM);
ei = EXT3_I(inode);
- lock_super (sb);
es = EXT3_SB(sb)->s_es;
repeat:
if (S_ISDIR(mode)) {
@@ -464,11 +462,9 @@ repeat:
err = ext3_journal_get_write_access(handle, bitmap_bh);
if (err) goto fail;
- if (ext3_set_bit(ino, bitmap_bh->b_data)) {
- ext3_error (sb, "ext3_new_inode",
- "bit already set for inode %lu", ino);
+ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
+ ino, bitmap_bh->b_data))
goto repeat;
- }
BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bitmap_bh);
if (err) goto fail;
@@ -504,26 +500,19 @@ repeat:
BUFFER_TRACE(bh2, "get_write_access");
err = ext3_journal_get_write_access(handle, bh2);
if (err) goto fail;
+ spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
gdp->bg_free_inodes_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
if (S_ISDIR(mode)) {
gdp->bg_used_dirs_count =
cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
- EXT3_SB(sb)->s_dir_count++;
}
+ spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bh2);
if (err) goto fail;
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
- err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
- if (err) goto fail;
- es->s_free_inodes_count =
- cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
sb->s_dirt = 1;
- if (err) goto fail;
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
@@ -576,7 +565,6 @@ repeat:
ei->i_state = EXT3_STATE_NEW;
- unlock_super(sb);
ret = inode;
if(DQUOT_ALLOC_INODE(inode)) {
DQUOT_DROP(inode);
@@ -600,7 +588,6 @@ repeat:
fail:
ext3_std_error(sb, err);
out:
- unlock_super(sb);
iput(inode);
ret = ERR_PTR(err);
really_out:
@@ -673,12 +660,13 @@ out:
unsigned long ext3_count_free_inodes (struct super_block * sb)
{
+ unsigned long desc_count;
+ struct ext3_group_desc *gdp;
+ int i;
#ifdef EXT3FS_DEBUG
struct ext3_super_block *es;
- unsigned long desc_count, bitmap_count, x;
- struct ext3_group_desc *gdp;
+ unsigned long bitmap_count, x;
struct buffer_head *bitmap_bh = NULL;
- int i;
lock_super (sb);
es = EXT3_SB(sb)->s_es;
@@ -706,7 +694,14 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
unlock_super(sb);
return desc_count;
#else
- return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count);
+ desc_count = 0;
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+ gdp = ext3_get_group_desc (sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ }
+ return desc_count;
#endif
}
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 53b6632b6025..cc3dfd7e907c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -460,7 +460,7 @@ void ext3_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
- kfree(sbi->s_debts);
+ kfree(sbi->s_bgi);
brelse(sbi->s_sbh);
/* Debugging code just in case the in-memory inode orphan list
@@ -901,6 +901,8 @@ static int ext3_check_descriptors (struct super_block * sb)
struct ext3_sb_info *sbi = EXT3_SB(sb);
unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
struct ext3_group_desc * gdp = NULL;
+ unsigned long total_free;
+ unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count);
int desc_block = 0;
int i;
@@ -947,6 +949,43 @@ static int ext3_check_descriptors (struct super_block * sb)
block += EXT3_BLOCKS_PER_GROUP(sb);
gdp++;
}
+
+ total_free = ext3_count_free_blocks(sb);
+ if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count)) {
+ printk("EXT3-fs: invalid s_free_blocks_count %u (real %lu)\n",
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count),
+ total_free);
+ EXT3_SB(sb)->s_es->s_free_blocks_count = cpu_to_le32(total_free);
+ }
+
+ /* distribute reserved blocks over groups -bzzz */
+ for(i = sbi->s_groups_count - 1; reserved && total_free && i >= 0; i--) {
+ int free;
+
+ gdp = ext3_get_group_desc (sb, i, NULL);
+ if (!gdp) {
+ ext3_error (sb, "ext3_check_descriptors",
+ "cant get descriptor for group %d", i);
+ return 0;
+ }
+
+ free = le16_to_cpu(gdp->bg_free_blocks_count);
+ if (free > reserved)
+ free = reserved;
+ sbi->s_bgi[i].bg_reserved = free;
+ reserved -= free;
+ total_free -= free;
+ }
+
+ total_free = ext3_count_free_inodes(sb);
+ if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count)) {
+ printk("EXT3-fs: invalid s_free_inodes_count %u (real %lu)\n",
+ le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count),
+ total_free);
+ EXT3_SB(sb)->s_es->s_free_inodes_count = cpu_to_le32(total_free);
+ }
+
+
return 1;
}
@@ -1307,13 +1346,17 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
printk (KERN_ERR "EXT3-fs: not enough memory\n");
goto failed_mount;
}
- sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+ sbi->s_bgi = kmalloc(sbi->s_groups_count * sizeof(struct ext3_bg_info),
GFP_KERNEL);
- if (!sbi->s_debts) {
- printk ("EXT3-fs: not enough memory\n");
+ if (!sbi->s_bgi) {
+ printk("EXT3-fs: not enough memory to allocate s_bgi\n");
goto failed_mount2;
}
- memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+ memset(sbi->s_bgi, 0, sbi->s_groups_count * sizeof(struct ext3_bg_info));
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ spin_lock_init(&sbi->s_bgi[i].bg_balloc_lock);
+ spin_lock_init(&sbi->s_bgi[i].bg_ialloc_lock);
+ }
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -1329,7 +1372,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount2;
}
sbi->s_gdb_count = db_count;
- sbi->s_dir_count = ext3_count_dirs(sb);
/*
* set up enough so that it can read an inode
*/
@@ -1432,8 +1474,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
failed_mount3:
journal_destroy(sbi->s_journal);
failed_mount2:
- if (sbi->s_debts)
- kfree(sbi->s_debts);
+ kfree(sbi->s_bgi);
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
@@ -1702,6 +1743,8 @@ static void ext3_commit_super (struct super_block * sb,
if (!sbh)
return;
es->s_wtime = cpu_to_le32(get_seconds());
+ es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
+ es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
if (sync)
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index cfc728819a48..a8bf5b48dc23 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -48,9 +48,7 @@ EXPORT_SYMBOL(journal_get_create_access);
EXPORT_SYMBOL(journal_get_undo_access);
EXPORT_SYMBOL(journal_dirty_data);
EXPORT_SYMBOL(journal_dirty_metadata);
-#if 0
EXPORT_SYMBOL(journal_release_buffer);
-#endif
EXPORT_SYMBOL(journal_forget);
#if 0
EXPORT_SYMBOL(journal_sync_buffer);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 473630d30d09..ea0cc114b133 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1106,7 +1106,6 @@ out_unlock:
return 0;
}
-#if 0
/*
* journal_release_buffer: undo a get_write_access without any buffer
* updates, if the update decided in the end that it didn't need access.
@@ -1140,7 +1139,6 @@ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "exit");
unlock_journal(journal);
}
-#endif
/**
* void journal_forget() - bforget() for potentially-journaled buffers.
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index c2f36c9d8022..277bf8cac2a4 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -344,7 +344,9 @@ struct ext3_inode {
#endif
#define ext3_set_bit ext2_set_bit
+#define ext3_set_bit_atomic ext2_set_bit_atomic
#define ext3_clear_bit ext2_clear_bit
+#define ext3_clear_bit_atomic ext2_clear_bit_atomic
#define ext3_test_bit ext2_test_bit
#define ext3_find_first_zero_bit ext2_find_first_zero_bit
#define ext3_find_next_zero_bit ext2_find_next_zero_bit
diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h
index 19bf2e132343..6d53e5cac0ac 100644
--- a/include/linux/ext3_fs_sb.h
+++ b/include/linux/ext3_fs_sb.h
@@ -21,6 +21,13 @@
#include <linux/wait.h>
#endif
+struct ext3_bg_info {
+ u8 bg_debts;
+ spinlock_t bg_balloc_lock;
+ spinlock_t bg_ialloc_lock;
+ unsigned long bg_reserved;
+} ____cacheline_aligned_in_smp;
+
/*
* third extended-fs super-block data in memory
*/
@@ -50,8 +57,7 @@ struct ext3_sb_info {
u32 s_next_generation;
u32 s_hash_seed[4];
int s_def_hash_version;
- unsigned long s_dir_count;
- u8 *s_debts;
+ struct ext3_bg_info *s_bgi;
/* Journaling */
struct inode * s_journal_inode;
diff --git a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h
index cb54b435a9db..60d7e40b609c 100644
--- a/include/linux/ext3_jbd.h
+++ b/include/linux/ext3_jbd.h
@@ -117,6 +117,12 @@ __ext3_journal_get_write_access(const char *where,
}
static inline void
+ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+{
+ journal_release_buffer(handle, bh);
+}
+
+static inline void
ext3_journal_forget(handle_t *handle, struct buffer_head *bh)
{
journal_forget(handle, bh);