summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile2
-rw-r--r--arch/i386/kernel/traps.c1
-rw-r--r--drivers/block/ll_rw_blk.c135
-rw-r--r--drivers/scsi/scsi_lib.c9
-rw-r--r--fs/inode.c5
-rw-r--r--fs/proc/kcore.c1
-rw-r--r--include/linux/blkdev.h16
-rw-r--r--include/linux/fs.h9
-rw-r--r--mm/filemap.c3
-rw-r--r--mm/vmalloc.c1
10 files changed, 141 insertions, 41 deletions
diff --git a/Makefile b/Makefile
index df7d891c9f15..eda1d3e00a89 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 4
SUBLEVEL = 1
-EXTRAVERSION =-pre5
+EXTRAVERSION =-pre6
KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 158d183b8c17..7c02813afc8c 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -23,6 +23,7 @@
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <linux/highmem.h>
#ifdef CONFIG_MCA
#include <linux/mca.h>
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 9ac560a6c054..f4e41801bb67 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -19,6 +19,7 @@
#include <linux/config.h>
#include <linux/locks.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
@@ -118,6 +119,19 @@ int * max_readahead[MAX_BLKDEV];
*/
int * max_sectors[MAX_BLKDEV];
+/*
+ * queued sectors for all devices, used to make sure we don't fill all
+ * of memory with locked buffers
+ */
+atomic_t queued_sectors;
+
+/*
+ * high and low watermark for above
+ */
+static int high_queued_sectors, low_queued_sectors;
+static int batch_requests, queue_nr_requests;
+static DECLARE_WAIT_QUEUE_HEAD(blk_buffers_wait);
+
static inline int get_max_sectors(kdev_t dev)
{
if (!max_sectors[MAJOR(dev)])
@@ -185,7 +199,7 @@ static int __blk_cleanup_queue(struct list_head *head)
**/
void blk_cleanup_queue(request_queue_t * q)
{
- int count = QUEUE_NR_REQUESTS;
+ int count = queue_nr_requests;
count -= __blk_cleanup_queue(&q->request_freelist[READ]);
count -= __blk_cleanup_queue(&q->request_freelist[WRITE]);
@@ -385,7 +399,7 @@ static void blk_init_free_list(request_queue_t *q)
/*
* Divide requests in half between read and write
*/
- for (i = 0; i < QUEUE_NR_REQUESTS; i++) {
+ for (i = 0; i < queue_nr_requests; i++) {
rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
memset(rq, 0, sizeof(struct request));
rq->rq_status = RQ_INACTIVE;
@@ -559,14 +573,12 @@ inline void drive_stat_acct (kdev_t dev, int rw,
/*
* add-request adds a request to the linked list.
- * It disables interrupts (acquires the request spinlock) so that it can muck
- * with the request-lists in peace. Thus it should be called with no spinlocks
- * held.
+ * io_request_lock is held and interrupts disabled, as we muck with the
+ * request queue list.
*
* By this point, req->cmd is always either READ/WRITE, never READA,
* which is important for drive_stat_acct() above.
*/
-
static inline void add_request(request_queue_t * q, struct request * req,
struct list_head *insert_here)
{
@@ -622,9 +634,17 @@ void inline blkdev_release_request(struct request *req)
req->q = NULL;
/*
- * Request may not have originated from ll_rw_blk
+ * Request may not have originated from ll_rw_blk. if not,
+ * assume it has free buffers and check waiters
*/
if (q) {
+ /*
+ * we've released enough buffers to start I/O again
+ */
+ if (waitqueue_active(&blk_buffers_wait)
+ && atomic_read(&queued_sectors) < low_queued_sectors)
+ wake_up(&blk_buffers_wait);
+
if (!list_empty(&q->request_freelist[rw])) {
blk_refill_freelist(q, rw);
list_add(&req->table, &q->request_freelist[rw]);
@@ -637,7 +657,7 @@ void inline blkdev_release_request(struct request *req)
*/
list_add(&req->table, &q->pending_freelist[rw]);
- if (++q->pending_free[rw] >= (QUEUE_NR_REQUESTS >> 4)) {
+ if (++q->pending_free[rw] >= batch_requests) {
int wake_up = q->pending_free[rw];
blk_refill_freelist(q, rw);
wake_up_nr(&q->wait_for_request, wake_up);
@@ -669,7 +689,7 @@ static void attempt_merge(request_queue_t * q,
* will have been updated to the appropriate number,
* and we shouldn't do it here too.
*/
- if(!q->merge_requests_fn(q, req, next, max_segments))
+ if (!q->merge_requests_fn(q, req, next, max_segments))
return;
q->elevator.elevator_merge_req_fn(req, next);
@@ -755,13 +775,13 @@ static int __make_request(request_queue_t * q, int rw,
max_sectors = get_max_sectors(bh->b_rdev);
again:
+ head = &q->queue_head;
/*
* Now we acquire the request spinlock, we have to be mega careful
* not to schedule or do something nonatomic
*/
spin_lock_irq(&io_request_lock);
- head = &q->queue_head;
insert_here = head->prev;
if (list_empty(head)) {
q->plug_device_fn(q, bh->b_rdev); /* is atomic */
@@ -780,6 +800,7 @@ again:
req->bhtail->b_reqnext = bh;
req->bhtail = bh;
req->nr_sectors = req->hard_nr_sectors += count;
+ blk_started_io(count);
drive_stat_acct(req->rq_dev, req->cmd, count, 0);
attempt_back_merge(q, req, max_sectors, max_segments);
goto out;
@@ -794,6 +815,7 @@ again:
req->current_nr_sectors = count;
req->sector = req->hard_sector = sector;
req->nr_sectors = req->hard_nr_sectors += count;
+ blk_started_io(count);
drive_stat_acct(req->rq_dev, req->cmd, count, 0);
attempt_front_merge(q, head, req, max_sectors, max_segments);
goto out;
@@ -817,10 +839,9 @@ again:
}
/*
- * Grab a free request from the freelist. Read first try their
- * own queue - if that is empty, we steal from the write list.
- * Writes must block if the write list is empty, and read aheads
- * are not crucial.
+ * Grab a free request from the freelist - if that is empty, check
+ * if we are doing read ahead and abort instead of blocking for
+ * a free slot.
*/
get_rq:
if (freereq) {
@@ -849,6 +870,7 @@ get_rq:
req->bh = bh;
req->bhtail = bh;
req->rq_dev = bh->b_rdev;
+ blk_started_io(count);
add_request(q, req, insert_here);
out:
if (freereq)
@@ -901,13 +923,13 @@ void generic_make_request (int rw, struct buffer_head * bh)
int major = MAJOR(bh->b_rdev);
request_queue_t *q;
- if (!bh->b_end_io) BUG();
+ if (!bh->b_end_io)
+ BUG();
+
if (blk_size[major]) {
unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
- unsigned int sector, count;
-
- count = bh->b_size >> 9;
- sector = bh->b_rsector;
+ unsigned long sector = bh->b_rsector;
+ unsigned int count = bh->b_size >> 9;
if (maxsector < count || maxsector - count < sector) {
bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
@@ -918,7 +940,7 @@ void generic_make_request (int rw, struct buffer_head * bh)
when mounting a device. */
printk(KERN_INFO
"attempt to access beyond end of device\n");
- printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n",
+ printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
kdevname(bh->b_rdev), rw,
(sector + count)>>1,
blk_size[major][MINOR(bh->b_rdev)]);
@@ -945,14 +967,13 @@ void generic_make_request (int rw, struct buffer_head * bh)
buffer_IO_error(bh);
break;
}
- }
- while (q->make_request_fn(q, rw, bh));
+ } while (q->make_request_fn(q, rw, bh));
}
/**
* submit_bh: submit a buffer_head to the block device later for I/O
- * @rw: whether to %READ or %WRITE, or mayve to %READA (read ahead)
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
* @bh: The &struct buffer_head which describes the I/O
*
* submit_bh() is very similar in purpose to generic_make_request(), and
@@ -975,7 +996,7 @@ void submit_bh(int rw, struct buffer_head * bh)
* further remap this.
*/
bh->b_rdev = bh->b_dev;
- bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+ bh->b_rsector = bh->b_blocknr * (bh->b_size >> 9);
generic_make_request(rw, bh);
@@ -1050,8 +1071,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
/* Verify requested block sizes. */
for (i = 0; i < nr; i++) {
- struct buffer_head *bh;
- bh = bhs[i];
+ struct buffer_head *bh = bhs[i];
if (bh->b_size % correct_size) {
printk(KERN_NOTICE "ll_rw_block: device %s: "
"only %d-char blocks implemented (%u)\n",
@@ -1068,8 +1088,17 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
}
for (i = 0; i < nr; i++) {
- struct buffer_head *bh;
- bh = bhs[i];
+ struct buffer_head *bh = bhs[i];
+
+ /*
+ * don't lock any more buffers if we are above the high
+ * water mark. instead start I/O on the queued stuff.
+ */
+ if (atomic_read(&queued_sectors) >= high_queued_sectors) {
+ run_task_queue(&tq_disk);
+ wait_event(blk_buffers_wait,
+ atomic_read(&queued_sectors) < low_queued_sectors);
+ }
/* Only one thread can actually submit the I/O. */
if (test_and_set_bit(BH_Lock, &bh->b_state))
@@ -1132,6 +1161,7 @@ int end_that_request_first (struct request *req, int uptodate, char *name)
if ((bh = req->bh) != NULL) {
nsect = bh->b_size >> 9;
+ blk_finished_io(nsect);
req->bh = bh->b_reqnext;
bh->b_reqnext = NULL;
bh->b_end_io(bh, uptodate);
@@ -1161,9 +1191,12 @@ void end_that_request_last(struct request *req)
blkdev_release_request(req);
}
+#define MB(kb) ((kb) << 10)
+
int __init blk_dev_init(void)
{
struct blk_dev_struct *dev;
+ int total_ram;
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request),
@@ -1178,6 +1211,51 @@ int __init blk_dev_init(void)
memset(ro_bits,0,sizeof(ro_bits));
memset(max_readahead, 0, sizeof(max_readahead));
memset(max_sectors, 0, sizeof(max_sectors));
+
+ atomic_set(&queued_sectors, 0);
+ total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
+
+ /*
+	 * Try to keep 128MB max hysteresis. If not possible,
+ * use half of RAM
+ */
+ high_queued_sectors = (total_ram * 2) / 3;
+ low_queued_sectors = high_queued_sectors - MB(128);
+ if (low_queued_sectors < 0)
+ low_queued_sectors = total_ram / 2;
+
+ /*
+ * for big RAM machines (>= 384MB), use more for I/O
+ */
+ if (total_ram >= MB(384)) {
+ high_queued_sectors = (total_ram * 4) / 5;
+ low_queued_sectors = high_queued_sectors - MB(128);
+ }
+
+ /*
+ * make it sectors (512b)
+ */
+ high_queued_sectors <<= 1;
+ low_queued_sectors <<= 1;
+
+ /*
+ * Scale free request slots per queue too
+ */
+ total_ram = (total_ram + MB(32) - 1) & ~(MB(32) - 1);
+ if ((queue_nr_requests = total_ram >> 9) > QUEUE_NR_REQUESTS)
+ queue_nr_requests = QUEUE_NR_REQUESTS;
+
+ /*
+ * adjust batch frees according to queue length, with upper limit
+ */
+ if ((batch_requests = queue_nr_requests >> 3) > 32)
+ batch_requests = 32;
+
+ printk("block: queued sectors max/low %dkB/%dkB, %d slots per queue\n",
+ high_queued_sectors / 2,
+ low_queued_sectors / 2,
+ queue_nr_requests);
+
#ifdef CONFIG_AMIGA_Z2RAM
z2_init();
#endif
@@ -1300,3 +1378,4 @@ EXPORT_SYMBOL(blk_queue_make_request);
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(blkdev_release_request);
EXPORT_SYMBOL(generic_unplug_device);
+EXPORT_SYMBOL(queued_sectors);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 04ccfe6d6d75..dcde48437493 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -362,6 +362,7 @@ static Scsi_Cmnd *__scsi_end_request(Scsi_Cmnd * SCpnt,
struct request *req;
struct buffer_head *bh;
Scsi_Device * SDpnt;
+ int nsect;
ASSERT_LOCK(&io_request_lock, 0);
@@ -373,11 +374,13 @@ static Scsi_Cmnd *__scsi_end_request(Scsi_Cmnd * SCpnt,
}
do {
if ((bh = req->bh) != NULL) {
+ nsect = bh->b_size >> 9;
+ blk_finished_io(nsect);
req->bh = bh->b_reqnext;
- req->nr_sectors -= bh->b_size >> 9;
- req->sector += bh->b_size >> 9;
+ req->nr_sectors -= nsect;
+ req->sector += nsect;
bh->b_reqnext = NULL;
- sectors -= bh->b_size >> 9;
+ sectors -= nsect;
bh->b_end_io(bh, uptodate);
if ((bh = req->bh) != NULL) {
req->current_nr_sectors = bh->b_size >> 9;
diff --git a/fs/inode.c b/fs/inode.c
index c85ed6426331..7939267f4502 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -134,12 +134,17 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block * sb = inode->i_sb;
+
if (sb) {
/* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */
if (flags & (I_DIRTY | I_DIRTY_SYNC)) {
if (sb->s_op && sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode);
}
+ /* avoid the locking if we can */
+ if ((inode->i_state & flags) != flags) {
+ return ;
+ }
spin_lock(&inode_lock);
if ((inode->i_state & flags) != flags) {
inode->i_state |= flags;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 00c16769e002..f95e9f6241cc 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -17,6 +17,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/vmalloc.h>
+#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/io.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0ea8f7064ba7..01cd38a8368e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -64,9 +64,10 @@ typedef void (plug_device_fn) (request_queue_t *q, kdev_t device);
typedef void (unplug_device_fn) (void *q);
/*
- * Default nr free requests per queue
+ * Default nr free requests per queue, ll_rw_blk will scale it down
+ * according to available RAM at init time
*/
-#define QUEUE_NR_REQUESTS 512
+#define QUEUE_NR_REQUESTS 8192
struct request_queue
{
@@ -176,6 +177,8 @@ extern int * max_sectors[MAX_BLKDEV];
extern int * max_segments[MAX_BLKDEV];
+extern atomic_t queued_sectors;
+
#define MAX_SEGMENTS 128
#define MAX_SECTORS (MAX_SEGMENTS*8)
@@ -203,5 +206,14 @@ static inline int get_hardsect_size(kdev_t dev)
return 512;
}
+#define blk_finished_io(nsects) \
+ atomic_sub(nsects, &queued_sectors); \
+ if (atomic_read(&queued_sectors) < 0) { \
+ printk("block: queued_sectors < 0\n"); \
+ atomic_set(&queued_sectors, 0); \
+ }
+
+#define blk_started_io(nsects) \
+ atomic_add(nsects, &queued_sectors);
#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f2d99107bf79..686aef31a600 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -825,20 +825,17 @@ struct super_operations {
extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)
{
- if ((inode->i_state & I_DIRTY) != I_DIRTY)
- __mark_inode_dirty(inode, I_DIRTY);
+ __mark_inode_dirty(inode, I_DIRTY);
}
static inline void mark_inode_dirty_sync(struct inode *inode)
{
- if (!(inode->i_state & I_DIRTY_SYNC))
- __mark_inode_dirty(inode, I_DIRTY_SYNC);
+ __mark_inode_dirty(inode, I_DIRTY_SYNC);
}
static inline void mark_inode_dirty_pages(struct inode *inode)
{
- if (inode && !(inode->i_state & I_DIRTY_PAGES))
- __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
}
struct dquot_operations {
diff --git a/mm/filemap.c b/mm/filemap.c
index cd97dc77f5ad..4c89ad3e9156 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,7 +143,8 @@ void __set_page_dirty(struct page *page)
list_add(&page->list, &mapping->dirty_pages);
spin_unlock(&pagecache_lock);
- mark_inode_dirty_pages(mapping->host);
+ if (mapping->host)
+ mark_inode_dirty_pages(mapping->host);
}
/**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 62ce5f1ffed8..93edab662caa 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -9,6 +9,7 @@
#include <linux/malloc.h>
#include <linux/vmalloc.h>
#include <linux/spinlock.h>
+#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>