summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Morton <akpm@osdl.org>2004-04-11 23:26:08 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2004-04-11 23:26:08 -0700
commitbb0d96728fb63cf1d2294bb1dcafd60926e49cd5 (patch)
treeb79a5619fa5ec271b3bd06f221065f57ecce6619
parent7c563ced265e3134a5c5c5b7ca2b31218993a204 (diff)
[PATCH] reiserfs: data=ordered support
From: Chris Mason <mason@suse.com> reiserfs data=ordered support.
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/reiserfs/file.c167
-rw-r--r--fs/reiserfs/inode.c285
-rw-r--r--fs/reiserfs/ioctl.c7
-rw-r--r--fs/reiserfs/journal.c411
-rw-r--r--fs/reiserfs/super.c52
-rw-r--r--include/linux/reiserfs_fs.h44
-rw-r--r--include/linux/reiserfs_fs_sb.h65
8 files changed, 803 insertions, 229 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 605ce2099aa5..be9cc963a178 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1583,6 +1583,7 @@ int try_to_release_page(struct page *page, int gfp_mask)
return mapping->a_ops->releasepage(page, gfp_mask);
return try_to_free_buffers(page);
}
+EXPORT_SYMBOL(try_to_release_page);
/**
* block_invalidatepage - invalidate part of all of a buffer-backed page
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 99321f2fcdf6..4b461667b231 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -9,6 +9,8 @@
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
/*
** We pack the tails of files on file close, not at the time they are written.
@@ -150,6 +152,7 @@ out:
Maps all unmapped but prepared pages from the list.
Updates metadata with newly allocated blocknumbers as needed */
int reiserfs_allocate_blocks_for_region(
+ struct reiserfs_transaction_handle *th,
struct inode *inode, /* Inode we work with */
loff_t pos, /* Writing position */
int num_pages, /* number of pages write going
@@ -167,7 +170,6 @@ int reiserfs_allocate_blocks_for_region(
struct cpu_key key; // cpu key of item that we are going to deal with
struct item_head *ih; // pointer to item head that we are going to deal with
struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
- struct reiserfs_transaction_handle th; // transaction handle for transaction we are going to create.
__u32 * item; // pointer to item we are going to deal with
INITIALIZE_PATH(path); // path to item, that we are going to deal with.
b_blocknr_t allocated_blocks[blocks_to_allocate]; // Pointer to a place where allocated blocknumbers would be stored. Right now statically allocated, later that will change.
@@ -194,7 +196,7 @@ int reiserfs_allocate_blocks_for_region(
/* If we came here, it means we absolutely need to open a transaction,
since we need to allocate some blocks */
reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
- journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
+ journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
reiserfs_update_inode_transaction(inode) ;
/* Look for the in-tree position of our write, need path for block allocator */
@@ -206,7 +208,7 @@ int reiserfs_allocate_blocks_for_region(
/* Allocate blocks */
/* First fill in "hint" structure for block allocator */
- hint.th = &th; // transaction handle.
+ hint.th = th; // transaction handle.
hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
hint.inode = inode; // Inode is needed by block allocator too.
hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
@@ -222,7 +224,7 @@ int reiserfs_allocate_blocks_for_region(
/* We flush the transaction in case of no space. This way some
blocks might become free */
SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
- restart_transaction(&th, inode, &path);
+ restart_transaction(th, inode, &path);
/* We might have scheduled, so search again */
res = search_for_position_by_key(inode->i_sb, &key, &path);
@@ -296,7 +298,7 @@ int reiserfs_allocate_blocks_for_region(
/* Ok, there is existing indirect item already. Need to append it */
/* Calculate position past inserted item */
make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
- res = reiserfs_paste_into_item( &th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste);
+ res = reiserfs_paste_into_item( th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste);
if ( res ) {
kfree(zeros);
goto error_exit_free_blocks;
@@ -326,7 +328,7 @@ int reiserfs_allocate_blocks_for_region(
kfree(zeros);
goto error_exit_free_blocks;
}
- res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)zeros);
+ res = reiserfs_insert_item( th, &path, &key, &ins_ih, (char *)zeros);
} else {
reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
}
@@ -336,8 +338,8 @@ int reiserfs_allocate_blocks_for_region(
}
/* Now we want to check if transaction is too full, and if it is
we restart it. This will also free the path. */
- if (journal_transaction_should_end(&th, th.t_blocks_allocated))
- restart_transaction(&th, inode, &path);
+ if (journal_transaction_should_end(th, th->t_blocks_allocated))
+ restart_transaction(th, inode, &path);
/* Well, need to recalculate path and stuff */
set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
@@ -368,7 +370,7 @@ retry:
one. */
/* First if we are already modifying current item, log it */
if ( modifying_this_item ) {
- journal_mark_dirty (&th, inode->i_sb, bh);
+ journal_mark_dirty (th, inode->i_sb, bh);
modifying_this_item = 0;
}
/* Then set the key to look for a new indirect item (offset of old
@@ -432,7 +434,7 @@ retry:
if ( modifying_this_item ) { // We need to log last-accessed block, if it
// was modified, but not logged yet.
- journal_mark_dirty (&th, inode->i_sb, bh);
+ journal_mark_dirty (th, inode->i_sb, bh);
}
if ( curr_block < blocks_to_allocate ) {
@@ -443,7 +445,7 @@ retry:
// position. We do not need to recalculate path as it should
// already point to correct place.
make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
- res = reiserfs_paste_into_item( &th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
+ res = reiserfs_paste_into_item( th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
if ( res ) {
goto error_exit_free_blocks;
}
@@ -474,29 +476,18 @@ retry:
goto error_exit_free_blocks;
}
/* Insert item into the tree with the data as its body */
- res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block));
+ res = reiserfs_insert_item( th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block));
} else {
reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
}
}
- /* Now the final thing, if we have grew the file, we must update it's size*/
- if ( pos + write_bytes > inode->i_size) {
- inode->i_size = pos + write_bytes; // Set new size
- /* If the file have grown so much that tail packing is no longer possible, reset
- "need to pack" flag */
- if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
- (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
- REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
- }
-
- /* Amount of on-disk blocks used by file have changed, update it */
+ // the caller is responsible for closing the transaction
+ // unless we return an error, they are also responsible for logging
+ // the inode.
+ //
inode->i_blocks += blocks_to_allocate << (inode->i_blkbits - 9);
- reiserfs_update_sd(&th, inode); // And update on-disk metadata
- // finish all journal stuff now, We are not going to play with metadata
- // anymore.
pathrelse(&path);
- journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
reiserfs_write_unlock(inode->i_sb);
// go through all the pages/buffers and map the buffers to newly allocated
@@ -527,6 +518,7 @@ retry:
if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
curr_block++;
+ set_buffer_new(bh);
}
}
}
@@ -540,10 +532,11 @@ error_exit_free_blocks:
pathrelse(&path);
// free blocks
for( i = 0; i < blocks_to_allocate; i++ )
- reiserfs_free_block( &th, le32_to_cpu(allocated_blocks[i]));
+ reiserfs_free_block(th, le32_to_cpu(allocated_blocks[i]));
error_exit:
- journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
+ reiserfs_update_sd(th, inode); // update any changes we made to blk count
+ journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
reiserfs_write_unlock(inode->i_sb);
return res;
@@ -603,12 +596,63 @@ int reiserfs_copy_from_user_to_file_region(
return page_fault?-EFAULT:0;
}
+/* taken fs/buffer.c:__block_commit_write */
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ unsigned block_start, block_end;
+ int partial = 0;
+ unsigned blocksize;
+ struct buffer_head *bh, *head;
+ unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ int new;
+
+ blocksize = 1 << inode->i_blkbits;
+
+ for(bh = head = page_buffers(page), block_start = 0;
+ bh != head || !block_start;
+ block_start=block_end, bh = bh->b_this_page)
+ {
+
+ new = buffer_new(bh);
+ clear_buffer_new(bh);
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ } else {
+ set_buffer_uptodate(bh);
+ if (!buffer_dirty(bh)) {
+ mark_buffer_dirty(bh);
+ /* do data=ordered on any page past the end
+ * of file and any buffer marked BH_New.
+ */
+ if (reiserfs_data_ordered(inode->i_sb) &&
+ (new || page->index >= i_size_index)) {
+ reiserfs_add_ordered_list(inode, bh);
+ }
+ }
+ }
+ }
+
+ /*
+ * If this is a partial write which happened to make all buffers
+ * uptodate then we can optimize away a bogus readpage() for
+ * the next read(). Here we 'discover' whether the page went
+ * uptodate as a result of this (potentially partial) write.
+ */
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
+}
/* Submit pages for write. This was separated from actual file copying
because we might want to allocate block numbers in-between.
This function assumes that caller will adjust file size to correct value. */
int reiserfs_submit_file_region_for_write(
+ struct reiserfs_transaction_handle *th,
+ struct inode *inode,
loff_t pos, /* Writing position offset */
int num_pages, /* Number of pages to write */
int write_bytes, /* number of bytes to write */
@@ -619,12 +663,14 @@ int reiserfs_submit_file_region_for_write(
int retval = 0; // Return value we are going to return.
int i; // loop counter
int offset; // Writing offset in page.
+ int orig_write_bytes = write_bytes;
+ int sd_update = 0;
for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
struct page *page=prepared_pages[i]; // Current page we process.
- status = block_commit_write(page, offset, offset+count);
+ status = reiserfs_commit_page(inode, page, offset, offset+count);
if ( status )
retval = status; // To not overcomplicate matters We are going to
// submit all the pages even if there was error.
@@ -636,6 +682,41 @@ int reiserfs_submit_file_region_for_write(
// to grab_cache_page
page_cache_release(page);
}
+ /* now that we've gotten all the ordered buffers marked dirty,
+ * we can safely update i_size and close any running transaction
+ */
+ if ( pos + orig_write_bytes > inode->i_size) {
+ inode->i_size = pos + orig_write_bytes; // Set new size
+ /* If the file have grown so much that tail packing is no
+ * longer possible, reset "need to pack" flag */
+ if ( (have_large_tails (inode->i_sb) &&
+ inode->i_size > i_block_size (inode)*4) ||
+ (have_small_tails (inode->i_sb) &&
+ inode->i_size > i_block_size(inode)) )
+ REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
+ else if ( (have_large_tails (inode->i_sb) &&
+ inode->i_size < i_block_size (inode)*4) ||
+ (have_small_tails (inode->i_sb) &&
+ inode->i_size < i_block_size(inode)) )
+ REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
+
+ if (th->t_trans_id) {
+ reiserfs_write_lock(inode->i_sb);
+ reiserfs_update_sd(th, inode); // And update on-disk metadata
+ reiserfs_write_unlock(inode->i_sb);
+ } else
+ inode->i_sb->s_op->dirty_inode(inode);
+
+ sd_update = 1;
+ }
+ if (th->t_trans_id) {
+ reiserfs_write_lock(inode->i_sb);
+ if (!sd_update)
+ reiserfs_update_sd(th, inode);
+ journal_end(th, th->t_super, th->t_blocks_allocated);
+ reiserfs_write_unlock(inode->i_sb);
+ }
+ th->t_trans_id = 0;
return retval;
}
@@ -1003,19 +1084,18 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
loff_t pos; // Current position in the file.
size_t res; // return value of various functions that we call.
struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
- struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
/* To simplify coding at this time, we store
locked pages in array for now */
- if ( count <= PAGE_CACHE_SIZE )
- return generic_file_write(file, buf, count, ppos);
+ struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
+ struct reiserfs_transaction_handle th;
+ th.t_trans_id = 0;
- if ( file->f_flags & O_DIRECT) { // Direct IO needs some special threating.
+ if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
int result, after_file_end = 0;
if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
/* If we are appending a file, we need to put this savelink in here.
If we will crash while doing direct io, finish_unfinished will
cut the garbage from the file end. */
- struct reiserfs_transaction_handle th;
reiserfs_write_lock(inode->i_sb);
journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
reiserfs_update_inode_transaction(inode);
@@ -1040,7 +1120,6 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
return result;
}
-
if ( unlikely((ssize_t) count < 0 ))
return -EINVAL;
@@ -1146,11 +1225,7 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
/* Fill in all the possible holes and append the file if needed */
- res = reiserfs_allocate_blocks_for_region(inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
- } else if ( pos + write_bytes > inode->i_size ) {
- /* File might have grown even though no new blocks were added */
- inode->i_size = pos + write_bytes;
- inode->i_sb->s_op->dirty_inode(inode);
+ res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
}
/* well, we have allocated the blocks, so it is time to free
@@ -1173,7 +1248,8 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
}
/* Send the pages to disk and unlock them. */
- res = reiserfs_submit_file_region_for_write(pos, num_pages, write_bytes, prepared_pages);
+ res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
+ write_bytes,prepared_pages);
if ( res )
break;
@@ -1184,10 +1260,17 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
balance_dirty_pages_ratelimited(inode->i_mapping);
}
+ /* this is only true on error */
+ if (th.t_trans_id) {
+ reiserfs_write_lock(inode->i_sb);
+ journal_end(&th, th.t_super, th.t_blocks_allocated);
+ reiserfs_write_unlock(inode->i_sb);
+ }
if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
up(&inode->i_sem);
+ reiserfs_async_progress_wait(inode->i_sb);
return (already_written != 0)?already_written:res;
out:
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 06635c7f18a9..cf88e52a2cfc 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,9 +22,12 @@ extern int reiserfs_default_io_size; /* default io size devuned in super.c */
#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
#define GET_BLOCK_NO_ISEM 8 /* i_sem is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
static int reiserfs_get_block (struct inode * inode, sector_t block,
struct buffer_head * bh_result, int create);
+static int reiserfs_commit_write(struct file *f, struct page *page,
+ unsigned from, unsigned to);
void reiserfs_delete_inode (struct inode * inode)
{
@@ -103,12 +106,6 @@ inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key
put_ih_entry_count( ih, entry_count );
}
-static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
- struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
-
- buffer_insert_list(&j->j_dirty_buffers_lock, bh, &j->j_dirty_buffers) ;
-}
-
//
// FIXME: we might cache recently accessed indirect item
@@ -437,7 +434,8 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
reiserfs_get_block() */
bh_result->b_size = (1 << inode->i_blkbits);
- ret = reiserfs_get_block(inode, iblock, bh_result, create) ;
+ ret = reiserfs_get_block(inode, iblock, bh_result,
+ create | GET_BLOCK_NO_DANGLE) ;
/* don't allow direct io onto tail pages */
if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
@@ -510,15 +508,14 @@ static int convert_tail_for_hole(struct inode *inode,
** won't trigger a get_block in this case.
*/
fix_tail_page_for_writing(tail_page) ;
- retval = block_prepare_write(tail_page, tail_start, tail_end,
- reiserfs_get_block) ;
+ retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
if (retval)
goto unlock ;
/* tail conversion might change the data in the page */
flush_dcache_page(tail_page) ;
- retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ;
+ retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
unlock:
if (tail_page != hole_page) {
@@ -557,7 +554,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
__u32 * item;
int done;
int fs_gen;
- struct reiserfs_transaction_handle th ;
+ struct reiserfs_transaction_handle *th = NULL;
/* space reserved in transaction batch:
. 3 balancings in direct->indirect conversion
. 1 block involved into reiserfs_update_sd()
@@ -565,12 +562,11 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
can incur (much) more that 3 balancings. */
int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
int version;
- int transaction_started = 0 ;
+ int dangle = 1;
loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
/* bad.... */
reiserfs_write_lock(inode->i_sb);
- th.t_trans_id = 0 ;
version = get_inode_item_key_version (inode);
if (block < 0) {
@@ -594,6 +590,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
reiserfs_write_unlock(inode->i_sb);
return ret;
}
+ /*
+ * if we're already in a transaction, make sure to close
+ * any new transactions we start in this func
+ */
+ if ((create & GET_BLOCK_NO_DANGLE) ||
+ reiserfs_transaction_running(inode->i_sb))
+ dangle = 0;
/* If file is of such a size, that it might have a tail and tails are enabled
** we should mark it as possibly needing tail packing on close
@@ -606,9 +609,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
make_cpu_key (&key, inode, new_offset,
TYPE_ANY, 3/*key length*/);
if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
- journal_begin(&th, inode->i_sb, jbegin_count) ;
+start_trans:
+ th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
+ if (!th) {
+ retval = -ENOMEM;
+ goto failure;
+ }
reiserfs_update_inode_transaction(inode) ;
- transaction_started = 1 ;
}
research:
@@ -628,23 +635,21 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
/* we have to allocate block for the unformatted node */
- if (!transaction_started) {
+ if (!th) {
pathrelse(&path) ;
- journal_begin(&th, inode->i_sb, jbegin_count) ;
- reiserfs_update_inode_transaction(inode) ;
- transaction_started = 1 ;
- goto research ;
+ goto start_trans;
}
- repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create);
+ repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
if (repeat == NO_DISK_SPACE) {
/* restart the transaction to give the journal a chance to free
** some blocks. releases the path, so we have to go back to
** research if we succeed on the second try
*/
- restart_transaction(&th, inode, &path) ;
- repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create);
+ SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
+ restart_transaction(th, inode, &path) ;
+ repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
if (repeat != NO_DISK_SPACE) {
goto research ;
@@ -672,16 +677,18 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
goto research;
}
set_buffer_new(bh_result);
+ if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
+ reiserfs_add_ordered_list(inode, bh_result);
put_block_num(item, pos_in_item, allocated_block_nr) ;
unfm_ptr = allocated_block_nr;
- journal_mark_dirty (&th, inode->i_sb, bh);
+ journal_mark_dirty (th, inode->i_sb, bh);
inode->i_blocks += (inode->i_sb->s_blocksize / 512) ;
- reiserfs_update_sd(&th, inode) ;
+ reiserfs_update_sd(th, inode) ;
}
set_block_dev_mapped(bh_result, unfm_ptr, inode);
pathrelse (&path);
- if (transaction_started)
- journal_end(&th, inode->i_sb, jbegin_count) ;
+ if (!dangle && th)
+ reiserfs_end_persistent_transaction(th);
reiserfs_write_unlock(inode->i_sb);
@@ -692,16 +699,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
return 0;
}
- if (!transaction_started) {
- /* if we don't pathrelse, we could vs-3050 on the buffer if
- ** someone is waiting for it (they can't finish until the buffer
- ** is released, we can start a new transaction until they finish)
- */
+ if (!th) {
pathrelse(&path) ;
- journal_begin(&th, inode->i_sb, jbegin_count) ;
- reiserfs_update_inode_transaction(inode) ;
- transaction_started = 1 ;
- goto research;
+ goto start_trans;
}
/* desired position is not found or is in the direct item. We have
@@ -729,9 +729,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
set_cpu_key_k_offset (&tmp_key, 1);
PATH_LAST_POSITION(&path) ++;
- retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp);
+ retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, (char *)&unp);
if (retval) {
- reiserfs_free_block (&th, allocated_block_nr);
+ reiserfs_free_block (th, allocated_block_nr);
goto failure; // retval == -ENOSPC or -EIO or -EEXIST
}
if (unp)
@@ -755,8 +755,14 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
node. FIXME: this should also get into page cache */
pathrelse(&path) ;
- journal_end(&th, inode->i_sb, jbegin_count) ;
- transaction_started = 0 ;
+ /*
+ * ugly, but we can only end the transaction if
+ * we aren't nested
+ */
+ if (th->t_refcount == 1) {
+ reiserfs_end_persistent_transaction(th);
+ th = NULL;
+ }
retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
if (retval) {
@@ -764,18 +770,19 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
printk("clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ;
if (allocated_block_nr) {
/* the bitmap, the super, and the stat data == 3 */
- journal_begin(&th, inode->i_sb, 3) ;
- reiserfs_free_block (&th, allocated_block_nr);
- transaction_started = 1 ;
+ if (!th)
+ th = reiserfs_persistent_transaction(inode->i_sb,3);
+ if (th)
+ reiserfs_free_block (th, allocated_block_nr);
}
goto failure ;
}
goto research ;
}
- retval = direct2indirect (&th, inode, &path, unbh, tail_offset);
+ retval = direct2indirect (th, inode, &path, unbh, tail_offset);
if (retval) {
reiserfs_unmap_buffer(unbh);
- reiserfs_free_block (&th, allocated_block_nr);
+ reiserfs_free_block (th, allocated_block_nr);
goto failure;
}
/* it is important the set_buffer_uptodate is done after
@@ -795,7 +802,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
/* we've converted the tail, so we must
** flush unbh before the transaction commits
*/
- add_to_flushlist(inode, unbh) ;
+ reiserfs_add_tail_list(inode, unbh) ;
/* mark it dirty now to prevent commit_write from adding
** this buffer to the inode's dirty buffer list
@@ -858,13 +865,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
only have space for one block */
blocks_needed=max_to_insert?max_to_insert:1;
}
- retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
+ retval = reiserfs_paste_into_item (th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
if (blocks_needed != 1)
kfree(un);
if (retval) {
- reiserfs_free_block (&th, allocated_block_nr);
+ reiserfs_free_block (th, allocated_block_nr);
goto failure;
}
if (done) {
@@ -889,8 +896,8 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
** release the path so that anybody waiting on the path before
** ending their transaction will be able to continue.
*/
- if (journal_transaction_should_end(&th, th.t_blocks_allocated)) {
- restart_transaction(&th, inode, &path) ;
+ if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+ restart_transaction(th, inode, &path) ;
}
/* inserting indirect pointers for a hole can take a
** long time. reschedule if needed
@@ -907,7 +914,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
"%K should not be found\n", &key);
retval = -EEXIST;
if (allocated_block_nr)
- reiserfs_free_block (&th, allocated_block_nr);
+ reiserfs_free_block (th, allocated_block_nr);
pathrelse(&path) ;
goto failure;
}
@@ -921,9 +928,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
retval = 0;
failure:
- if (transaction_started) {
- reiserfs_update_sd(&th, inode) ;
- journal_end(&th, inode->i_sb, jbegin_count) ;
+ if (th && !dangle) {
+ reiserfs_update_sd(th, inode) ;
+ reiserfs_end_persistent_transaction(th);
}
reiserfs_write_unlock(inode->i_sb);
reiserfs_check_path(&path) ;
@@ -2007,7 +2014,8 @@ out:
/* this is where we fill in holes in the file. */
if (use_get_block) {
retval = reiserfs_get_block(inode, block, bh_result,
- GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ;
+ GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
+ GET_BLOCK_NO_DANGLE);
if (!retval) {
if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
/* get_block failed to find a mapped unformatted node. */
@@ -2219,13 +2227,43 @@ static int reiserfs_writepage (struct page * page, struct writeback_control *wbc
return reiserfs_write_full_page(page, wbc) ;
}
-
int reiserfs_prepare_write(struct file *f, struct page *page,
unsigned from, unsigned to) {
struct inode *inode = page->mapping->host ;
+ int ret;
+ int old_ref = 0;
+
reiserfs_wait_on_write_block(inode->i_sb) ;
fix_tail_page_for_writing(page) ;
- return block_prepare_write(page, from, to, reiserfs_get_block) ;
+ if (reiserfs_transaction_running(inode->i_sb)) {
+ struct reiserfs_transaction_handle *th;
+ th = (struct reiserfs_transaction_handle *)current->journal_info;
+ old_ref = th->t_refcount;
+ th->t_refcount++;
+ }
+
+ ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
+ if (ret && reiserfs_transaction_running(inode->i_sb)) {
+ struct reiserfs_transaction_handle *th = current->journal_info;
+ /* this gets a little ugly. If reiserfs_get_block returned an
+ * error and left a transacstion running, we've got to close it,
+ * and we've got to free handle if it was a persistent transaction.
+ *
+ * But, if we had nested into an existing transaction, we need
+ * to just drop the ref count on the handle.
+ *
+ * If old_ref == 0, the transaction is from reiserfs_get_block,
+ * and it was a persistent trans. Otherwise, it was nested above.
+ */
+ if (th->t_refcount > old_ref) {
+ if (old_ref)
+ th->t_refcount--;
+ else
+ reiserfs_end_persistent_transaction(th);
+ }
+ }
+ return ret;
+
}
@@ -2237,16 +2275,21 @@ static int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to) {
struct inode *inode = page->mapping->host ;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
- int ret ;
+ int ret = 0;
+ struct reiserfs_transaction_handle *th = NULL;
reiserfs_wait_on_write_block(inode->i_sb) ;
+ if (reiserfs_transaction_running(inode->i_sb)) {
+ th = current->journal_info;
+ }
+ reiserfs_commit_page(inode, page, from, to);
/* generic_commit_write does this for us, but does not update the
** transaction tracking stuff when the size changes. So, we have
** to do the i_size updates here.
*/
if (pos > inode->i_size) {
- struct reiserfs_transaction_handle th ;
+ struct reiserfs_transaction_handle myth ;
reiserfs_write_lock(inode->i_sb);
/* If the file have grown beyond the border where it
can have a tail, unmark it as needing a tail
@@ -2255,16 +2298,19 @@ static int reiserfs_commit_write(struct file *f, struct page *page,
(have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
- journal_begin(&th, inode->i_sb, 1) ;
+ journal_begin(&myth, inode->i_sb, 1) ;
reiserfs_update_inode_transaction(inode) ;
inode->i_size = pos ;
- reiserfs_update_sd(&th, inode) ;
- journal_end(&th, inode->i_sb, 1) ;
+ reiserfs_update_sd(&myth, inode) ;
+ journal_end(&myth, inode->i_sb, 1) ;
+ reiserfs_write_unlock(inode->i_sb);
+ }
+ if (th) {
+ reiserfs_write_lock(inode->i_sb);
+ reiserfs_end_persistent_transaction(th);
reiserfs_write_unlock(inode->i_sb);
}
- ret = generic_commit_write(f, page, from, to) ;
-
/* we test for O_SYNC here so we can commit the transaction
** for any packed tails the file might have had
*/
@@ -2324,16 +2370,110 @@ void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
}
}
+/* decide if this buffer needs to stay around for data logging or ordered
+** write purposes
+*/
+static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+{
+ int ret = 1 ;
+ struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+
+ spin_lock(&j->j_dirty_buffers_lock) ;
+ if (!buffer_mapped(bh)) {
+ goto free_jh;
+ }
+ /* the page is locked, and the only places that log a data buffer
+ * also lock the page.
+ */
+#if 0
+ if (reiserfs_file_data_log(inode)) {
+ /* very conservative, leave the buffer pinned if anyone might need it.
+ ** this should be changed to drop the buffer if it is only in the
+ ** current transaction
+ */
+ if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+ ret = 0 ;
+ }
+ } else
+#endif
+ if (buffer_dirty(bh) || buffer_locked(bh)) {
+ struct reiserfs_journal_list *jl;
+ struct reiserfs_jh *jh = bh->b_private;
+
+ /* why is this safe?
+ * reiserfs_setattr updates i_size in the on disk
+ * stat data before allowing vmtruncate to be called.
+ *
+ * If buffer was put onto the ordered list for this
+ * transaction, we know for sure either this transaction
+ * or an older one already has updated i_size on disk,
+ * and this ordered data won't be referenced in the file
+ * if we crash.
+ *
+ * if the buffer was put onto the ordered list for an older
+ * transaction, we need to leave it around
+ */
+ if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+ ret = 0;
+ }
+free_jh:
+ if (ret && bh->b_private) {
+ reiserfs_free_jh(bh);
+ }
+ spin_unlock(&j->j_dirty_buffers_lock) ;
+ return ret ;
+}
+
+/* clm -- taken from fs/buffer.c:block_invalidate_page */
+static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct buffer_head *head, *bh, *next;
+ struct inode *inode = page->mapping->host;
+ unsigned int curr_off = 0;
+ int ret = 1;
+
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+ next = bh->b_this_page;
+
+ /*
+ * is this block fully invalidated?
+ */
+ if (offset <= curr_off) {
+ if (invalidatepage_can_drop(inode, bh))
+ reiserfs_unmap_buffer(bh);
+ else
+ ret = 0;
+ }
+ curr_off = next_off;
+ bh = next;
+ } while (bh != head);
+
+ /*
+ * We release buffers only if the entire page is being invalidated.
+ * The get_block cached value has been unconditionally invalidated,
+ * so real IO is not possible anymore.
+ */
+ if (!offset && ret)
+ ret = try_to_release_page(page, 0);
+out:
+ return ret;
+}
+
/*
* Returns 1 if the page's buffers were dropped. The page is locked.
*
* Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
* in the buffers at page_buffers(page).
*
- * FIXME: Chris says the buffer list is not used with `mount -o notail',
- * so in that case the fs can avoid the extra locking. Create a second
- * address_space_operations with a NULL ->releasepage and install that
- * into new address_spaces.
+ * even in -o notail mode, we can't be sure an old mount without -o notail
+ * didn't create files with tails.
*/
static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
{
@@ -2347,11 +2487,13 @@ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
head = page_buffers(page) ;
bh = head ;
do {
- if (!buffer_dirty(bh) && !buffer_locked(bh)) {
- list_del_init(&bh->b_assoc_buffers) ;
- } else {
+ if (bh->b_private) {
+ if (!buffer_dirty(bh) && !buffer_locked(bh)) {
+ reiserfs_free_jh(bh);
+ } else {
ret = 0 ;
break ;
+ }
}
bh = bh->b_this_page ;
} while (bh != head) ;
@@ -2379,6 +2521,7 @@ struct address_space_operations reiserfs_address_space_operations = {
.readpage = reiserfs_readpage,
.readpages = reiserfs_readpages,
.releasepage = reiserfs_releasepage,
+ .invalidatepage = reiserfs_invalidatepage,
.sync_page = block_sync_page,
.prepare_write = reiserfs_prepare_write,
.commit_write = reiserfs_commit_write,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 9d7a018c366f..ec59e074416a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -92,6 +92,7 @@ int reiserfs_unpack (struct inode * inode, struct file * filp)
int retval = 0;
int index ;
struct page *page ;
+ struct address_space *mapping ;
unsigned long write_from ;
unsigned long blocksize = inode->i_sb->s_blocksize ;
@@ -122,17 +123,19 @@ int reiserfs_unpack (struct inode * inode, struct file * filp)
** reiserfs_get_block to unpack the tail for us.
*/
index = inode->i_size >> PAGE_CACHE_SHIFT ;
- page = grab_cache_page(inode->i_mapping, index) ;
+ mapping = inode->i_mapping ;
+ page = grab_cache_page(mapping, index) ;
retval = -ENOMEM;
if (!page) {
goto out ;
}
- retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ;
+ retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
if (retval)
goto out_unlock ;
/* conversion can change page contents, must flush */
flush_dcache_page(page) ;
+ retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
REISERFS_I(inode)->i_flags |= i_nopack_mask;
out_unlock:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index cfff6ec0871f..17278f415916 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -54,6 +54,7 @@
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
/* gets a struct reiserfs_journal_list * from a list head */
@@ -595,6 +596,248 @@ static int journal_list_still_alive(struct super_block *s,
return 0;
}
+/* end_io handler for log/metadata blocks: warn if a buffer that is
+ * still pinned in the journal (buffer_journaled) reached the disk,
+ * update the uptodate bit from the IO result, then unlock and drop
+ * the reference taken at submit time.
+ */
+static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
+ char b[BDEVNAME_SIZE];
+
+ if (buffer_journaled(bh)) {
+ reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n",
+ bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
+ }
+ if (uptodate)
+ set_buffer_uptodate(bh) ;
+ else
+ clear_buffer_uptodate(bh) ;
+ unlock_buffer(bh) ;
+ put_bh(bh) ;
+}
+
+/* end_io handler for ordered-data buffers: same as the sync handler
+ * above minus the pinned-buffer warning (ordered data is never
+ * journaled metadata).
+ */
+static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) {
+ if (uptodate)
+ set_buffer_uptodate(bh) ;
+ else
+ clear_buffer_uptodate(bh) ;
+ unlock_buffer(bh) ;
+ put_bh(bh) ;
+}
+
+/* submit a block that has been copied into the log.  Takes a
+ * reference for the IO (dropped in the end_io handler), clears the
+ * dirty bit so the write goes through our handler, and BUGs if the
+ * debugging-only BH_JTest bit is absent or the buffer is not
+ * uptodate.
+ */
+static void submit_logged_buffer(struct buffer_head *bh) {
+ get_bh(bh) ;
+ bh->b_end_io = reiserfs_end_buffer_io_sync ;
+ mark_buffer_notjournal_new(bh) ;
+ clear_buffer_dirty(bh) ;
+ if (!test_and_clear_bit(BH_JTest, &bh->b_state))
+ BUG();
+ if (!buffer_uptodate(bh))
+ BUG();
+ submit_bh(WRITE, bh) ;
+}
+
+/* submit an ordered-data buffer for write.  Caller must have the
+ * buffer locked; the IO reference taken here is dropped by
+ * reiserfs_end_ordered_io.
+ */
+static void submit_ordered_buffer(struct buffer_head *bh) {
+ get_bh(bh) ;
+ bh->b_end_io = reiserfs_end_ordered_io;
+ clear_buffer_dirty(bh) ;
+ if (!buffer_uptodate(bh))
+ BUG();
+ submit_bh(WRITE, bh) ;
+}
+
+#define CHUNK_SIZE 32
+struct buffer_chunk {
+ struct buffer_head *bh[CHUNK_SIZE];
+ int nr;
+};
+
+/* submit every logged (metadata) buffer collected in the chunk and
+ * reset it for reuse.
+ */
+static void write_chunk(struct buffer_chunk *chunk) {
+ int i;
+ for (i = 0; i < chunk->nr ; i++) {
+ submit_logged_buffer(chunk->bh[i]) ;
+ }
+ chunk->nr = 0;
+}
+
+/* submit every ordered-data buffer collected in the chunk and reset
+ * it for reuse.
+ */
+static void write_ordered_chunk(struct buffer_chunk *chunk) {
+ int i;
+ for (i = 0; i < chunk->nr ; i++) {
+ submit_ordered_buffer(chunk->bh[i]) ;
+ }
+ chunk->nr = 0;
+}
+
+/* queue bh into the chunk.  When the chunk fills, drop 'lock' (if
+ * non-NULL), flush the chunk through fn, and retake the lock.
+ * Returns 1 if the chunk was flushed, 0 otherwise.
+ */
+static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
+ spinlock_t *lock,
+ void (fn)(struct buffer_chunk *))
+{
+ int ret = 0;
+ if (chunk->nr >= CHUNK_SIZE)
+ BUG();
+ chunk->bh[chunk->nr++] = bh;
+ if (chunk->nr >= CHUNK_SIZE) {
+ ret = 1;
+ if (lock)
+ spin_unlock(lock);
+ fn(chunk);
+ if (lock)
+ spin_lock(lock);
+ }
+ return ret;
+}
+
+
+/* count of reiserfs_jh structs currently allocated (sanity-checked in
+ * reiserfs_free_jh) */
+atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
+/* allocate a jh, retrying forever: on kmalloc failure we yield() and
+ * try again rather than propagating -ENOMEM to the journal paths.
+ */
+static struct reiserfs_jh *alloc_jh(void) {
+ struct reiserfs_jh *jh;
+ while(1) {
+ jh = kmalloc(sizeof(*jh), GFP_NOFS);
+ if (jh) {
+ atomic_inc(&nr_reiserfs_jh);
+ return jh;
+ }
+ yield();
+ }
+}
+
+/*
+ * we want to free the jh when the buffer has been written
+ * and waited on
+ */
+/* detach and free the jh attached to bh via b_private, unlinking it
+ * from its journal list and dropping the bh reference taken in
+ * __add_jh.  No-op if the buffer has no jh.  Callers hold
+ * j_dirty_buffers_lock (NOTE(review): true for the visible callers;
+ * confirm for any new ones).
+ */
+void reiserfs_free_jh(struct buffer_head *bh) {
+ struct reiserfs_jh *jh;
+
+ jh = bh->b_private;
+ if (jh) {
+ bh->b_private = NULL;
+ jh->bh = NULL;
+ list_del_init(&jh->list);
+ kfree(jh);
+ if (atomic_read(&nr_reiserfs_jh) <= 0)
+ BUG();
+ atomic_dec(&nr_reiserfs_jh);
+ put_bh(bh);
+ }
+}
+
+/* attach bh to the current transaction's ordered list (tail == 0,
+ * j_bh_list) or tail-conversion list (tail != 0, j_tail_bh_list).  If
+ * the buffer already has a jh from an older transaction it is moved
+ * to the current one; otherwise a new jh is allocated and a bh
+ * reference taken (released later by reiserfs_free_jh).
+ */
+static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
+ int tail)
+{
+ struct reiserfs_jh *jh;
+
+ if (bh->b_private) {
+ spin_lock(&j->j_dirty_buffers_lock);
+ /* re-check under the lock: the jh may have been freed between
+ * the unlocked test above and acquiring the lock */
+ if (!bh->b_private) {
+ spin_unlock(&j->j_dirty_buffers_lock);
+ goto no_jh;
+ }
+ jh = bh->b_private;
+ list_del_init(&jh->list);
+ } else {
+no_jh:
+ get_bh(bh);
+ jh = alloc_jh();
+ spin_lock(&j->j_dirty_buffers_lock);
+ /* buffer must be locked for __add_jh, should be able to have
+ * two adds at the same time
+ */
+ if (bh->b_private)
+ BUG();
+ jh->bh = bh;
+ bh->b_private = jh;
+ }
+ jh->jl = j->j_current_jl;
+ if (tail)
+ list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
+ else {
+ list_add_tail(&jh->list, &jh->jl->j_bh_list);
+ }
+ spin_unlock(&j->j_dirty_buffers_lock);
+ return 0;
+}
+
+/* queue a tail-conversion target buffer on the current transaction */
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) {
+ return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
+}
+/* queue a data=ordered buffer on the current transaction */
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) {
+ return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
+}
+
+#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
+/* write out, then wait on, every ordered/tail buffer queued on a
+ * journal list.  Dirty buffers are submitted in chunks of CHUNK_SIZE
+ * and moved to a private list; a second pass waits for the IO and
+ * frees the jh structs.
+ *
+ * Called with no locks held; takes 'lock' (j_dirty_buffers_lock)
+ * internally and drops it around IO and any rescheduling.  Returns 0
+ * on success or -EIO if any buffer failed to write.
+ */
+static int write_ordered_buffers(spinlock_t *lock,
+ struct reiserfs_journal *j,
+ struct reiserfs_journal_list *jl,
+ struct list_head *list)
+{
+ struct buffer_head *bh;
+ struct reiserfs_jh *jh;
+ int ret = 0;
+ struct buffer_chunk chunk;
+ struct list_head tmp;
+ INIT_LIST_HEAD(&tmp);
+
+ chunk.nr = 0;
+ spin_lock(lock);
+ while(!list_empty(list)) {
+ jh = JH_ENTRY(list->next);
+ bh = jh->bh;
+ get_bh(bh);
+ if (test_set_buffer_locked(bh)) {
+ if (!buffer_dirty(bh)) {
+ /* clean but locked: IO already in flight, just wait later */
+ list_del_init(&jh->list);
+ list_add(&jh->list, &tmp);
+ goto loop_next;
+ }
+ spin_unlock(lock);
+ if (chunk.nr)
+ write_ordered_chunk(&chunk);
+ wait_on_buffer(bh);
+ /* need_resched must be called; testing the bare function
+ * pointer is always true */
+ if (need_resched())
+ schedule();
+ spin_lock(lock);
+ goto loop_next;
+ }
+ if (buffer_dirty(bh)) {
+ list_del_init(&jh->list);
+ list_add(&jh->list, &tmp);
+ add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+ } else {
+ reiserfs_free_jh(bh);
+ unlock_buffer(bh);
+ }
+loop_next:
+ put_bh(bh);
+ /* only reschedule when no submitted-but-unflushed chunk is pending */
+ if (chunk.nr == 0 && need_resched()) {
+ spin_unlock(lock);
+ schedule();
+ spin_lock(lock);
+ }
+ }
+ if (chunk.nr) {
+ spin_unlock(lock);
+ write_ordered_chunk(&chunk);
+ spin_lock(lock);
+ }
+ while(!list_empty(&tmp)) {
+ /* walk newest-first so we wait on the most recently submitted IO last */
+ jh = JH_ENTRY(tmp.prev);
+ bh = jh->bh;
+ get_bh(bh);
+ reiserfs_free_jh(bh);
+
+ if (buffer_locked(bh)) {
+ spin_unlock(lock);
+ wait_on_buffer(bh);
+ spin_lock(lock);
+ }
+ if (!buffer_uptodate(bh))
+ ret = -EIO;
+ put_bh(bh);
+ if (need_resched()) {
+ spin_unlock(lock);
+ schedule();
+ spin_lock(lock);
+ }
+ }
+ spin_unlock(lock);
+ return ret;
+}
+
static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
struct reiserfs_journal_list *other_jl;
struct reiserfs_journal_list *first_jl;
@@ -656,6 +899,13 @@ find_first:
}
return 0;
}
+/* throttle new writers while the async commit path is flushing: if
+ * j_async_throttle is elevated, wait briefly for block-device
+ * congestion to clear.  Always returns 0.
+ */
+int reiserfs_async_progress_wait(struct super_block *s) {
+ struct reiserfs_journal *j = SB_JOURNAL(s);
+ if (atomic_read(&j->j_async_throttle))
+ blk_congestion_wait(WRITE, HZ/10);
+ return 0;
+}
/*
** if this journal list still has commit blocks unflushed, send them to disk.
@@ -710,28 +960,40 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
goto put_jl;
}
+ if (!list_empty(&jl->j_bh_list)) {
+ unlock_kernel();
+ write_ordered_buffers(&SB_JOURNAL(s)->j_dirty_buffers_lock,
+ SB_JOURNAL(s), jl, &jl->j_bh_list);
+ lock_kernel();
+ }
+ if (!list_empty(&jl->j_bh_list))
+ BUG();
/*
* for the description block and all the log blocks, submit any buffers
* that haven't already reached the disk
*/
+ atomic_inc(&SB_JOURNAL(s)->j_async_throttle);
for (i = 0 ; i < (jl->j_len + 1) ; i++) {
bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
SB_ONDISK_JOURNAL_SIZE(s);
tbh = journal_find_get_block(s, bn) ;
- wait_on_buffer(tbh) ;
- ll_rw_block(WRITE, 1, &tbh) ;
+ if (buffer_dirty(tbh))
+ ll_rw_block(WRITE, 1, &tbh) ;
put_bh(tbh) ;
}
+ atomic_dec(&SB_JOURNAL(s)->j_async_throttle);
/* wait on everything written so far before writing the commit */
for (i = 0 ; i < (jl->j_len + 1) ; i++) {
bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
(jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
tbh = journal_find_get_block(s, bn) ;
-
wait_on_buffer(tbh) ;
+ // since we're using ll_rw_blk above, it might have skipped over
+ // a locked buffer. Double check here
+ //
if (buffer_dirty(tbh))
- BUG();
+ sync_dirty_buffer(tbh);
if (!buffer_uptodate(tbh)) {
reiserfs_panic(s, "journal-601, buffer write failed\n") ;
}
@@ -892,33 +1154,6 @@ restart:
return 0 ;
}
-static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
- char b[BDEVNAME_SIZE];
-
- if (buffer_journaled(bh)) {
- reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n",
- bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
- }
- if (uptodate)
- set_buffer_uptodate(bh) ;
- else
- clear_buffer_uptodate(bh) ;
- unlock_buffer(bh) ;
- put_bh(bh) ;
-}
-
-static void submit_logged_buffer(struct buffer_head *bh) {
- get_bh(bh) ;
- bh->b_end_io = reiserfs_end_buffer_io_sync ;
- mark_buffer_notjournal_new(bh) ;
- clear_buffer_dirty(bh) ;
- if (!test_and_clear_bit(BH_JTest, &bh->b_state))
- BUG();
- if (!buffer_uptodate(bh))
- BUG();
- submit_bh(WRITE, bh) ;
-}
-
static void del_from_work_list(struct super_block *s,
struct reiserfs_journal_list *jl) {
if (!list_empty(&jl->j_working_list)) {
@@ -1158,28 +1393,6 @@ flush_older_and_return:
return 0 ;
}
-#define CHUNK_SIZE 32
-struct buffer_chunk {
- struct buffer_head *bh[CHUNK_SIZE];
- int nr;
-};
-
-static void write_chunk(struct buffer_chunk *chunk) {
- int i;
- for (i = 0; i < chunk->nr ; i++) {
- submit_logged_buffer(chunk->bh[i]) ;
- }
- chunk->nr = 0;
-}
-
-static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
- if (chunk->nr >= CHUNK_SIZE)
- BUG();
- chunk->bh[chunk->nr++] = bh;
- if (chunk->nr >= CHUNK_SIZE)
- write_chunk(chunk);
-}
-
static int write_one_transaction(struct super_block *s,
struct reiserfs_journal_list *jl,
struct buffer_chunk *chunk)
@@ -1214,7 +1427,7 @@ static int write_one_transaction(struct super_block *s,
if (!buffer_journal_dirty(tmp_bh) ||
reiserfs_buffer_prepared(tmp_bh))
BUG();
- add_to_chunk(chunk, tmp_bh);
+ add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
ret++;
} else {
/* note, cn->bh might be null now */
@@ -1937,6 +2150,8 @@ retry:
memset(jl, 0, sizeof(*jl));
INIT_LIST_HEAD(&jl->j_list);
INIT_LIST_HEAD(&jl->j_working_list);
+ INIT_LIST_HEAD(&jl->j_tail_bh_list);
+ INIT_LIST_HEAD(&jl->j_bh_list);
sema_init(&jl->j_commit_lock, 1);
SB_JOURNAL(s)->j_num_lists++;
get_journal_list(jl);
@@ -2166,6 +2381,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
SB_JOURNAL(p_s_sb)->j_len = 0 ;
SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
atomic_set(&(SB_JOURNAL(p_s_sb)->j_wcount), 0) ;
+ atomic_set(&(SB_JOURNAL(p_s_sb)->j_async_throttle), 0) ;
SB_JOURNAL(p_s_sb)->j_bcount = 0 ;
SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
SB_JOURNAL(p_s_sb)->j_last = NULL ;
@@ -2376,6 +2592,43 @@ relock:
return 0 ;
}
+/* begin a transaction whose handle can outlive the current call
+ * stack.  If a transaction is already running for this task it is
+ * reused with a bumped refcount; otherwise a handle is kmalloc'd and
+ * journal_begin started on it.  Returns NULL on allocation or
+ * journal_begin failure.  Pair with
+ * reiserfs_end_persistent_transaction().
+ */
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *s, int nblocks) {
+ int ret ;
+ struct reiserfs_transaction_handle *th ;
+
+ /* if we're nesting into an existing transaction. It will be
+ ** persistent on its own
+ */
+ if (reiserfs_transaction_running(s)) {
+ th = current->journal_info ;
+ th->t_refcount++ ;
+ if (th->t_refcount < 2) {
+ BUG() ;
+ }
+ return th ;
+ }
+ th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
+ if (!th)
+ return NULL;
+ ret = journal_begin(th, s, nblocks) ;
+ if (ret) {
+ reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+ return NULL;
+ }
+ return th ;
+}
+
+/* close a handle from reiserfs_persistent_transaction().  journal_end
+ * drops t_refcount; the handle is only kfree'd when this was the last
+ * reference (refcount reached 0).  Returns journal_end's result.
+ */
+int
+reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) {
+ struct super_block *s = th->t_super;
+ int ret;
+ ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+ if (th->t_refcount == 0)
+ reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+ return ret;
+}
+
static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
struct reiserfs_transaction_handle *cur_th = current->journal_info;
@@ -2522,7 +2775,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
if (!current->journal_info && th->t_refcount > 1)
printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
- if (th->t_refcount > 1) {
+
+ th->t_refcount--;
+ if (th->t_refcount > 0) {
struct reiserfs_transaction_handle *cur_th = current->journal_info ;
/* we aren't allowed to close a nested transaction on a different
@@ -2531,7 +2786,6 @@ int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_
if (cur_th->t_super != th->t_super)
BUG() ;
- th->t_refcount--;
if (th != cur_th) {
memcpy(current->journal_info, th, sizeof(*th));
th->t_trans_id = 0;
@@ -2648,14 +2902,7 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
}
/*
-** used to get memory back from async commits that are floating around
-** and to reclaim any blocks deleted but unusable because their commits
-** haven't hit disk yet. called from bitmap.c
-**
-** if it starts flushing things, it ors SCHEDULE_OCCURRED into repeat.
-** note, this is just if schedule has a chance of occurring. I need to
-** change flush_commit_lists to have a repeat parameter too.
-**
+** writeback the pending async commits to disk
*/
static void flush_async_commits(void *p) {
struct super_block *p_s_sb = p;
@@ -2670,6 +2917,9 @@ static void flush_async_commits(void *p) {
flush_commit_list(p_s_sb, jl, 1);
}
unlock_kernel();
+ atomic_inc(&SB_JOURNAL(p_s_sb)->j_async_throttle);
+ filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
+ atomic_dec(&SB_JOURNAL(p_s_sb)->j_async_throttle);
}
/*
@@ -3072,6 +3322,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
p_s_sb->s_dirt = 1;
wake_queued_writers(p_s_sb);
+ reiserfs_async_progress_wait(p_s_sb);
goto out ;
}
@@ -3248,23 +3499,38 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
init_journal_hash(p_s_sb) ;
+ // make sure reiserfs_add_jh sees the new current_jl before we
+ // write out the tails
+ smp_mb();
+
/* tail conversion targets have to hit the disk before we end the
* transaction. Otherwise a later transaction might repack the tail
* before this transaction commits, leaving the data block unflushed and
* clean, if we crash before the later transaction commits, the data block
* is lost.
*/
- fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
- &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+ if (!list_empty(&jl->j_tail_bh_list)) {
+ unlock_kernel();
+ write_ordered_buffers(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock,
+ SB_JOURNAL(p_s_sb), jl, &jl->j_tail_bh_list);
+ lock_kernel();
+ }
+ if (!list_empty(&jl->j_tail_bh_list))
+ BUG();
up(&jl->j_commit_lock);
/* honor the flush wishes from the caller, simple commits can
** be done outside the journal lock, they are done below
+ **
+ ** if we don't flush the commit list right now, we put it into
+ ** the work queue so the people waiting on the async progress work
+ ** queue don't wait for this proc to flush journal lists and such.
*/
if (flush) {
flush_commit_list(p_s_sb, jl, 1) ;
flush_journal_list(p_s_sb, jl, 1) ;
- }
+ } else
+ queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
/* if the next transaction has any chance of wrapping, flush
@@ -3322,15 +3588,12 @@ first_jl:
clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
- if (!flush) {
- if (wait_on_commit) {
- if (journal_list_still_alive(p_s_sb, commit_trans_id))
- flush_commit_list(p_s_sb, jl, 1) ;
- } else {
- queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
- }
+ if (!flush && wait_on_commit &&
+ journal_list_still_alive(p_s_sb, commit_trans_id)) {
+ flush_commit_list(p_s_sb, jl, 1) ;
}
out:
reiserfs_check_lock_depth("journal end2");
+ th->t_trans_id = 0;
return 0 ;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f75349fe4787..57991831eeef 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -510,6 +510,14 @@ typedef struct {
applied BEFORE setmask */
} opt_desc_t;
+/* possible values for -o data= */
+static const arg_desc_t logging_mode[] = {
+ {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
+ {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
+ {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
+ {NULL, 0}
+};
+
/* possible values for "-o block-allocator=" and bits which are to be set in
s_mount_opt of reiserfs specific part of in-core super block */
static const arg_desc_t balloc[] = {
@@ -664,6 +672,7 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
{"nolog", 0, 0, 0, 0}, /* This is unsupported */
{"replayonly", 0, 0, 1<<REPLAYONLY, 0},
{"block-allocator", 'a', balloc, 0, 0},
+ {"data", 'd', logging_mode, 0, 0},
{"resize", 'r', 0, 0, 0},
{"jdev", 'j', 0, 0, 0},
{"nolargeio", 'w', 0, 0, 0},
@@ -737,6 +746,33 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
return 1;
}
+/* clear all three data-journaling mode bits in s_mount_opt, then set
+ * the single bit for 'mode' (one of REISERFS_DATA_LOG / _ORDERED /
+ * _WRITEBACK).
+ */
+static void switch_data_mode(struct super_block *s, unsigned long mode) {
+ REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
+ (1 << REISERFS_DATA_ORDERED) |
+ (1 << REISERFS_DATA_WRITEBACK));
+ REISERFS_SB(s)->s_mount_opt |= (1 << mode);
+}
+
+/* apply a data= mode requested at remount.  Only switches (and logs)
+ * when the requested mode differs from the currently active one;
+ * precedence if several bits are set is log > ordered > writeback.
+ */
+static void handle_data_mode(struct super_block *s, unsigned long mount_options)
+{
+ if (mount_options & (1 << REISERFS_DATA_LOG)) {
+ if (!reiserfs_data_log(s)) {
+ switch_data_mode(s, REISERFS_DATA_LOG);
+ printk("reiserfs: switching to journaled data mode\n");
+ }
+ } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
+ if (!reiserfs_data_ordered(s)) {
+ switch_data_mode(s, REISERFS_DATA_ORDERED);
+ printk("reiserfs: switching to ordered data mode\n");
+ }
+ } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
+ if (!reiserfs_data_writeback(s)) {
+ switch_data_mode(s, REISERFS_DATA_WRITEBACK);
+ printk("reiserfs: switching to writeback data mode\n");
+ }
+ }
+}
+
static void handle_attrs( struct super_block *s )
{
struct reiserfs_super_block * rs;
@@ -814,6 +850,7 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
if (!(s->s_flags & MS_RDONLY))
return 0; /* We are read-write already */
+ handle_data_mode(s, mount_options);
REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ;
s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
journal_begin(&th, s, 10) ;
@@ -1306,6 +1343,21 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
SPRINTK(silent, "reiserfs:warning: - it is slow mode for debugging.\n");
#endif
+ /* make data=ordered the default */
+ if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
+ !reiserfs_data_writeback(s))
+ {
+ REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+ }
+
+ if (reiserfs_data_log(s)) {
+ printk("reiserfs: using journaled data mode\n");
+ } else if (reiserfs_data_ordered(s)) {
+ printk("reiserfs: using ordered data mode\n");
+ } else {
+ printk("reiserfs: using writeback data mode\n");
+ }
+
// set_device_ro(s->s_dev, 1) ;
if( journal_init(s, jdev_name, old_format, commit_max_age) ) {
SPRINTK(silent, "sh-2022: reiserfs_fill_super: unable to initialize journal space\n") ;
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index fb0bf2af7fd7..31e8047f0f41 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1707,6 +1707,15 @@ struct reiserfs_journal_header {
#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+enum reiserfs_bh_state_bits {
+ BH_JDirty = BH_PrivateStart,
+ BH_JDirty_wait,
+ BH_JNew,
+ BH_JPrepared,
+ BH_JRestore_dirty,
+ BH_JTest, // debugging only will go away
+};
+
/*
** transaction handle which is passed around for all journal calls
*/
@@ -1726,7 +1735,36 @@ struct reiserfs_transaction_handle {
should be displaced from others */
} ;
+/* used to keep track of ordered and tail writes, attached to the buffer
+ * head through b_journal_head.
+ */
+struct reiserfs_jh {
+ struct reiserfs_journal_list *jl;
+ struct buffer_head *bh;
+ struct list_head list;
+};
+
+void reiserfs_free_jh(struct buffer_head *bh);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+
+/* return 1 if the current task has a reiserfs transaction open on
+ * superblock s, 0 otherwise.  A handle with a NULL t_super is
+ * corrupt, hence the BUG().
+ */
+static inline int reiserfs_transaction_running(struct super_block *s) {
+ struct reiserfs_transaction_handle *th = current->journal_info ;
+ if (th && th->t_super == s)
+ return 1 ;
+ if (th && th->t_super == NULL)
+ BUG();
+ return 0 ;
+}
+
+int reiserfs_async_progress_wait(struct super_block *s);
+
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *, int count);
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+ unsigned from, unsigned to);
int reiserfs_flush_old_commits(struct super_block *);
void reiserfs_commit_for_inode(struct inode *) ;
void reiserfs_update_inode_transaction(struct inode *) ;
@@ -1741,7 +1779,6 @@ int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
int journal_release_error(struct reiserfs_transaction_handle*, struct super_block *) ;
int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
@@ -1749,11 +1786,6 @@ int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_
int buffer_journaled(const struct buffer_head *bh) ;
int mark_buffer_journal_new(struct buffer_head *bh) ;
-int reiserfs_add_page_to_flush_list(struct reiserfs_transaction_handle *,
- struct inode *, struct buffer_head *) ;
-int reiserfs_remove_page_from_flush_list(struct reiserfs_transaction_handle *,
- struct inode *) ;
-
int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, int) ;
/* why is this kerplunked right here? */
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index e1fe3ebe33c0..3248dcf369f2 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -107,21 +107,6 @@ typedef enum {
#define JOURNAL_HASH_SIZE 8192
#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */
-/* these are bh_state bit flag offset numbers, for use in the buffer head */
-
-#define BH_JDirty 16 /* journal data needs to be written before buffer can be marked dirty */
-#define BH_JDirty_wait 18 /* commit is done, buffer marked dirty */
-#define BH_JNew 19 /* buffer allocated during this transaction, no need to write if freed during this trans too */
-
-/* ugly. metadata blocks must be prepared before they can be logged.
-** prepared means unlocked and cleaned. If the block is prepared, but not
-** logged for some reason, any bits cleared while preparing it must be
-** set again.
-*/
-#define BH_JPrepared 20 /* block has been prepared for the log */
-#define BH_JRestore_dirty 22 /* restore the dirty bit later */
-#define BH_JTest 23 /* debugging use only */
-
/* One of these for every block in every transaction
** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a
** hash of all the in memory transactions.
@@ -178,6 +163,11 @@ struct reiserfs_journal_list {
/* time ordered list of all transactions we haven't tried to flush yet */
struct list_head j_working_list;
+
+ /* list of tail conversion targets in need of flush before commit */
+ struct list_head j_tail_bh_list;
+ /* list of data=ordered buffers in need of flush before commit */
+ struct list_head j_bh_list;
int j_refcount;
} ;
@@ -253,7 +243,9 @@ struct reiserfs_journal {
unsigned long j_max_trans_size ;
unsigned long j_max_batch_size ;
+ /* when flushing ordered buffers, throttle new ordered writers */
struct work_struct j_work;
+ atomic_t j_async_throttle;
};
#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */
@@ -408,11 +400,12 @@ struct reiserfs_sb_info
#define REISERFS_3_5 0
#define REISERFS_3_6 1
+enum reiserfs_mount_options {
/* Mount options */
-#define REISERFS_LARGETAIL 0 /* large tails will be created in a session */
-#define REISERFS_SMALLTAIL 17 /* small (for files less than block size) tails will be created in a session */
-#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_CONVERT 5 /* -o conv: causes conversion of old
+ REISERFS_LARGETAIL, /* large tails will be created in a session */
+ REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */
+ REPLAYONLY, /* replay journal and return 0. Use by fsck */
+ REISERFS_CONVERT, /* -o conv: causes conversion of old
format super block to the new
format. If not specified - old
partition will be dealt with in a
@@ -426,26 +419,29 @@ struct reiserfs_sb_info
** the existing hash on the FS, so if you have a tea hash disk, and mount
** with -o hash=rupasov, the mount will fail.
*/
-#define FORCE_TEA_HASH 6 /* try to force tea hash on mount */
-#define FORCE_RUPASOV_HASH 7 /* try to force rupasov hash on mount */
-#define FORCE_R5_HASH 8 /* try to force rupasov hash on mount */
-#define FORCE_HASH_DETECT 9 /* try to detect hash function on mount */
+ FORCE_TEA_HASH, /* try to force tea hash on mount */
+ FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
+ FORCE_R5_HASH, /* try to force rupasov hash on mount */
+ FORCE_HASH_DETECT, /* try to detect hash function on mount */
+ REISERFS_DATA_LOG,
+ REISERFS_DATA_ORDERED,
+ REISERFS_DATA_WRITEBACK,
/* used for testing experimental features, makes benchmarking new
features with and without more convenient, should never be used by
users in any code shipped to users (ideally) */
-#define REISERFS_NO_BORDER 11
-#define REISERFS_NO_UNHASHED_RELOCATION 12
-#define REISERFS_HASHED_RELOCATION 13
-
-#define REISERFS_ATTRS 15
+ REISERFS_NO_BORDER,
+ REISERFS_NO_UNHASHED_RELOCATION,
+ REISERFS_HASHED_RELOCATION,
+ REISERFS_ATTRS,
-#define REISERFS_TEST1 11
-#define REISERFS_TEST2 12
-#define REISERFS_TEST3 13
-#define REISERFS_TEST4 14
+ REISERFS_TEST1,
+ REISERFS_TEST2,
+ REISERFS_TEST3,
+ REISERFS_TEST4,
+};
#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
@@ -459,11 +455,12 @@ struct reiserfs_sb_info
#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_dont_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NOLOG))
#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-
+#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
+#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
+#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
void reiserfs_file_buffer (struct buffer_head * bh, int list);
extern struct file_system_type reiserfs_fs_type;