summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile2
-rw-r--r--arch/i386/kernel/traps.c1
-rw-r--r--drivers/block/ll_rw_blk.c135
-rw-r--r--drivers/scsi/scsi_lib.c9
-rw-r--r--fs/inode.c5
-rw-r--r--fs/proc/kcore.c1
-rw-r--r--include/linux/blkdev.h16
-rw-r--r--include/linux/fs.h9
-rw-r--r--mm/filemap.c3
-rw-r--r--mm/vmalloc.c1
10 files changed, 141 insertions, 41 deletions
diff --git a/Makefile b/Makefile
index df7d891c9f15..eda1d3e00a89 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 4
SUBLEVEL = 1
-EXTRAVERSION =-pre5
+EXTRAVERSION =-pre6
KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 158d183b8c17..7c02813afc8c 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -23,6 +23,7 @@
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <linux/highmem.h>
#ifdef CONFIG_MCA
#include <linux/mca.h>
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 9ac560a6c054..f4e41801bb67 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -19,6 +19,7 @@
#include <linux/config.h>
#include <linux/locks.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
@@ -118,6 +119,19 @@ int * max_readahead[MAX_BLKDEV];
*/
int * max_sectors[MAX_BLKDEV];
+/*
+ * queued sectors for all devices, used to make sure we don't fill all
+ * of memory with locked buffers
+ */
+atomic_t queued_sectors;
+
+/*
+ * high and low watermark for above
+ */
+static int high_queued_sectors, low_queued_sectors;
+static int batch_requests, queue_nr_requests;
+static DECLARE_WAIT_QUEUE_HEAD(blk_buffers_wait);
+
static inline int get_max_sectors(kdev_t dev)
{
if (!max_sectors[MAJOR(dev)])
@@ -185,7 +199,7 @@ static int __blk_cleanup_queue(struct list_head *head)
**/
void blk_cleanup_queue(request_queue_t * q)
{
- int count = QUEUE_NR_REQUESTS;
+ int count = queue_nr_requests;
count -= __blk_cleanup_queue(&q->request_freelist[READ]);
count -= __blk_cleanup_queue(&q->request_freelist[WRITE]);
@@ -385,7 +399,7 @@ static void blk_init_free_list(request_queue_t *q)
/*
* Divide requests in half between read and write
*/
- for (i = 0; i < QUEUE_NR_REQUESTS; i++) {
+ for (i = 0; i < queue_nr_requests; i++) {
rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
memset(rq, 0, sizeof(struct request));
rq->rq_status = RQ_INACTIVE;
@@ -559,14 +573,12 @@ inline void drive_stat_acct (kdev_t dev, int rw,
/*
* add-request adds a request to the linked list.
- * It disables interrupts (acquires the request spinlock) so that it can muck
- * with the request-lists in peace. Thus it should be called with no spinlocks
- * held.
+ * io_request_lock is held and interrupts disabled, as we muck with the
+ * request queue list.
*
* By this point, req->cmd is always either READ/WRITE, never READA,
* which is important for drive_stat_acct() above.
*/
-
static inline void add_request(request_queue_t * q, struct request * req,
struct list_head *insert_here)
{
@@ -622,9 +634,17 @@ void inline blkdev_release_request(struct request *req)
req->q = NULL;
/*
- * Request may not have originated from ll_rw_blk
+ * Request may not have originated from ll_rw_blk. if not,
+ * assume it has free buffers and check waiters
*/
if (q) {
+ /*
+ * we've released enough buffers to start I/O again
+ */
+ if (waitqueue_active(&blk_buffers_wait)
+ && atomic_read(&queued_sectors) < low_queued_sectors)
+ wake_up(&blk_buffers_wait);
+
if (!list_empty(&q->request_freelist[rw])) {
blk_refill_freelist(q, rw);
list_add(&req->table, &q->request_freelist[rw]);
@@ -637,7 +657,7 @@ void inline blkdev_release_request(struct request *req)
*/
list_add(&req->table, &q->pending_freelist[rw]);
- if (++q->pending_free[rw] >= (QUEUE_NR_REQUESTS >> 4)) {
+ if (++q->pending_free[rw] >= batch_requests) {
int wake_up = q->pending_free[rw];
blk_refill_freelist(q, rw);
wake_up_nr(&q->wait_for_request, wake_up);
@@ -669,7 +689,7 @@ static void attempt_merge(request_queue_t * q,
* will have been updated to the appropriate number,
* and we shouldn't do it here too.
*/
- if(!q->merge_requests_fn(q, req, next, max_segments))
+ if (!q->merge_requests_fn(q, req, next, max_segments))
return;
q->elevator.elevator_merge_req_fn(req, next);
@@ -755,13 +775,13 @@ static int __make_request(request_queue_t * q, int rw,
max_sectors = get_max_sectors(bh->b_rdev);
again:
+ head = &q->queue_head;
/*
* Now we acquire the request spinlock, we have to be mega careful
* not to schedule or do something nonatomic
*/
spin_lock_irq(&io_request_lock);
- head = &q->queue_head;
insert_here = head->prev;
if (list_empty(head)) {
q->plug_device_fn(q, bh->b_rdev); /* is atomic */
@@ -780,6 +800,7 @@ again:
req->bhtail->b_reqnext = bh;
req->bhtail = bh;
req->nr_sectors = req->hard_nr_sectors += count;
+ blk_started_io(count);
drive_stat_acct(req->rq_dev, req->cmd, count, 0);
attempt_back_merge(q, req, max_sectors, max_segments);
goto out;
@@ -794,6 +815,7 @@ again:
req->current_nr_sectors = count;
req->sector = req->hard_sector = sector;
req->nr_sectors = req->hard_nr_sectors += count;
+ blk_started_io(count);
drive_stat_acct(req->rq_dev, req->cmd, count, 0);
attempt_front_merge(q, head, req, max_sectors, max_segments);
goto out;
@@ -817,10 +839,9 @@ again:
}
/*
- * Grab a free request from the freelist. Read first try their
- * own queue - if that is empty, we steal from the write list.
- * Writes must block if the write list is empty, and read aheads
- * are not crucial.
+ * Grab a free request from the freelist - if that is empty, check
+ * if we are doing read ahead and abort instead of blocking for
+ * a free slot.
*/
get_rq:
if (freereq) {
@@ -849,6 +870,7 @@ get_rq:
req->bh = bh;
req->bhtail = bh;
req->rq_dev = bh->b_rdev;
+ blk_started_io(count);
add_request(q, req, insert_here);
out:
if (freereq)
@@ -901,13 +923,13 @@ void generic_make_request (int rw, struct buffer_head * bh)
int major = MAJOR(bh->b_rdev);
request_queue_t *q;
- if (!bh->b_end_io) BUG();
+ if (!bh->b_end_io)
+ BUG();
+
if (blk_size[major]) {
unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
- unsigned int sector, count;
-
- count = bh->b_size >> 9;
- sector = bh->b_rsector;
+ unsigned long sector = bh->b_rsector;
+ unsigned int count = bh->b_size >> 9;
if (maxsector < count || maxsector - count < sector) {
bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
@@ -918,7 +940,7 @@ void generic_make_request (int rw, struct buffer_head * bh)
when mounting a device. */
printk(KERN_INFO
"attempt to access beyond end of device\n");
- printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n",
+ printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
kdevname(bh->b_rdev), rw,
(sector + count)>>1,
blk_size[major][MINOR(bh->b_rdev)]);
@@ -945,14 +967,13 @@ void generic_make_request (int rw, struct buffer_head * bh)
buffer_IO_error(bh);
break;
}
- }
- while (q->make_request_fn(q, rw, bh));
+ } while (q->make_request_fn(q, rw, bh));
}
/**
* submit_bh: submit a buffer_head to the block device later for I/O
- * @rw: whether to %READ or %WRITE, or mayve to %READA (read ahead)
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
* @bh: The &struct buffer_head which describes the I/O
*
* submit_bh() is very similar in purpose to generic_make_request(), and
@@ -975,7 +996,7 @@ void submit_bh(int rw, struct buffer_head * bh)
* further remap this.
*/
bh->b_rdev = bh->b_dev;
- bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+ bh->b_rsector = bh->b_blocknr * (bh->b_size >> 9);
generic_make_request(rw, bh);
@@ -1050,8 +1071,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
/* Verify requested block sizes. */
for (i = 0; i < nr; i++) {
- struct buffer_head *bh;
- bh = bhs[i];
+ struct buffer_head *bh = bhs[i];
if (bh->b_size % correct_size) {
printk(KERN_NOTICE "ll_rw_block: device %s: "
"only %d-char blocks implemented (%u)\n",
@@ -1068,8 +1088,17 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
}
for (i = 0; i < nr; i++) {
- struct buffer_head *bh;
- bh = bhs[i];
+ struct buffer_head *bh = bhs[i];
+
+ /*
+ * don't lock any more buffers if we are above the high
+ * water mark. instead start I/O on the queued stuff.
+ */
+ if (atomic_read(&queued_sectors) >= high_queued_sectors) {
+ run_task_queue(&tq_disk);
+ wait_event(blk_buffers_wait,
+ atomic_read(&queued_sectors) < low_queued_sectors);
+ }
/* Only one thread can actually submit the I/O. */
if (test_and_set_bit(BH_Lock, &bh->b_state))
@@ -1132,6 +1161,7 @@ int end_that_request_first (struct request *req, int uptodate, char *name)
if ((bh = req->bh) != NULL) {
nsect = bh->b_size >> 9;
+ blk_finished_io(nsect);
req->bh = bh->b_reqnext;
bh->b_reqnext = NULL;
bh->b_end_io(bh, uptodate);
@@ -1161,9 +1191,12 @@ void end_that_request_last(struct request *req)
blkdev_release_request(req);
}
+#define MB(kb) ((kb) << 10)
+
int __init blk_dev_init(void)
{
struct blk_dev_struct *dev;
+ int total_ram;
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request),
@@ -1178,6 +1211,51 @@ int __init blk_dev_init(void)
memset(ro_bits,0,sizeof(ro_bits));
memset(max_readahead, 0, sizeof(max_readahead));
memset(max_sectors, 0, sizeof(max_sectors));
+
+ atomic_set(&queued_sectors, 0);
+ total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
+
+ /*
+	 * Try to keep 128MB max hysteresis. If not possible,
+ * use half of RAM
+ */
+ high_queued_sectors = (total_ram * 2) / 3;
+ low_queued_sectors = high_queued_sectors - MB(128);
+ if (low_queued_sectors < 0)
+ low_queued_sectors = total_ram / 2;
+
+ /*
+ * for big RAM machines (>= 384MB), use more for I/O
+ */
+ if (total_ram >= MB(384)) {
+ high_queued_sectors = (total_ram * 4) / 5;
+ low_queued_sectors = high_queued_sectors - MB(128);
+ }
+
+ /*
+ * make it sectors (512b)
+ */
+ high_queued_sectors <<= 1;
+ low_queued_sectors <<= 1;
+
+ /*
+ * Scale free request slots per queue too
+ */
+ total_ram = (total_ram + MB(32) - 1) & ~(MB(32) - 1);
+ if ((queue_nr_requests = total_ram >> 9) > QUEUE_NR_REQUESTS)
+ queue_nr_requests = QUEUE_NR_REQUESTS;
+
+ /*
+ * adjust batch frees according to queue length, with upper limit
+ */
+ if ((batch_requests = queue_nr_requests >> 3) > 32)
+ batch_requests = 32;
+
+ printk("block: queued sectors max/low %dkB/%dkB, %d slots per queue\n",
+ high_queued_sectors / 2,
+ low_queued_sectors / 2,
+ queue_nr_requests);
+
#ifdef CONFIG_AMIGA_Z2RAM
z2_init();
#endif
@@ -1300,3 +1378,4 @@ EXPORT_SYMBOL(blk_queue_make_request);
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(blkdev_release_request);
EXPORT_SYMBOL(generic_unplug_device);
+EXPORT_SYMBOL(queued_sectors);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 04ccfe6d6d75..dcde48437493 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -362,6 +362,7 @@ static Scsi_Cmnd *__scsi_end_request(Scsi_Cmnd * SCpnt,
struct request *req;
struct buffer_head *bh;
Scsi_Device * SDpnt;
+ int nsect;
ASSERT_LOCK(&io_request_lock, 0);
@@ -373,11 +374,13 @@ static Scsi_Cmnd *__scsi_end_request(Scsi_Cmnd * SCpnt,
}
do {
if ((bh = req->bh) != NULL) {
+ nsect = bh->b_size >> 9;
+ blk_finished_io(nsect);
req->bh = bh->b_reqnext;
- req->nr_sectors -= bh->b_size >> 9;
- req->sector += bh->b_size >> 9;
+ req->nr_sectors -= nsect;
+ req->sector += nsect;
bh->b_reqnext = NULL;
- sectors -= bh->b_size >> 9;
+ sectors -= nsect;
bh->b_end_io(bh, uptodate);
if ((bh = req->bh) != NULL) {
req->current_nr_sectors = bh->b_size >> 9;
diff --git a/fs/inode.c b/fs/inode.c
index c85ed6426331..7939267f4502 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -134,12 +134,17 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block * sb = inode->i_sb;
+
if (sb) {
/* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */
if (flags & (I_DIRTY | I_DIRTY_SYNC)) {
if (sb->s_op && sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode);
}
+ /* avoid the locking if we can */
+ if ((inode->i_state & flags) != flags) {
+ return ;
+ }
spin_lock(&inode_lock);
if ((inode->i_state & flags) != flags) {
inode->i_state |= flags;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 00c16769e002..f95e9f6241cc 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -17,6 +17,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/vmalloc.h>
+#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/io.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0ea8f7064ba7..01cd38a8368e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -64,9 +64,10 @@ typedef void (plug_device_fn) (request_queue_t *q, kdev_t device);
typedef void (unplug_device_fn) (void *q);
/*
- * Default nr free requests per queue
+ * Default nr free requests per queue, ll_rw_blk will scale it down
+ * according to available RAM at init time
*/
-#define QUEUE_NR_REQUESTS 512
+#define QUEUE_NR_REQUESTS 8192
struct request_queue
{
@@ -176,6 +177,8 @@ extern int * max_sectors[MAX_BLKDEV];
extern int * max_segments[MAX_BLKDEV];
+extern atomic_t queued_sectors;
+
#define MAX_SEGMENTS 128
#define MAX_SECTORS (MAX_SEGMENTS*8)
@@ -203,5 +206,14 @@ static inline int get_hardsect_size(kdev_t dev)
return 512;
}
+#define blk_finished_io(nsects) \
+ atomic_sub(nsects, &queued_sectors); \
+ if (atomic_read(&queued_sectors) < 0) { \
+ printk("block: queued_sectors < 0\n"); \
+ atomic_set(&queued_sectors, 0); \
+ }
+
+#define blk_started_io(nsects) \
+ atomic_add(nsects, &queued_sectors);
#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f2d99107bf79..686aef31a600 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -825,20 +825,17 @@ struct super_operations {
extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)
{
- if ((inode->i_state & I_DIRTY) != I_DIRTY)
- __mark_inode_dirty(inode, I_DIRTY);
+ __mark_inode_dirty(inode, I_DIRTY);
}
static inline void mark_inode_dirty_sync(struct inode *inode)
{
- if (!(inode->i_state & I_DIRTY_SYNC))
- __mark_inode_dirty(inode, I_DIRTY_SYNC);
+ __mark_inode_dirty(inode, I_DIRTY_SYNC);
}
static inline void mark_inode_dirty_pages(struct inode *inode)
{
- if (inode && !(inode->i_state & I_DIRTY_PAGES))
- __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
}
struct dquot_operations {
diff --git a/mm/filemap.c b/mm/filemap.c
index cd97dc77f5ad..4c89ad3e9156 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,7 +143,8 @@ void __set_page_dirty(struct page *page)
list_add(&page->list, &mapping->dirty_pages);
spin_unlock(&pagecache_lock);
- mark_inode_dirty_pages(mapping->host);
+ if (mapping->host)
+ mark_inode_dirty_pages(mapping->host);
}
/**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 62ce5f1ffed8..93edab662caa 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -9,6 +9,7 @@
#include <linux/malloc.h>
#include <linux/vmalloc.h>
#include <linux/spinlock.h>
+#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>