41 files changed, 935 insertions, 360 deletions
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 42ce2febe8b7..fcec73a7e379 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -121,7 +121,7 @@ static int vidport; static int lines, cols; #ifdef CONFIG_MULTIQUAD -static void * const xquad_portio = NULL; +static void * xquad_portio = NULL; #endif #include "../../../../lib/inflate.c" diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8a04f3d2c8aa..9d513dc1ceb2 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1060,11 +1060,11 @@ static void __init smp_boot_cpus(unsigned int max_cpus) if (clustered_apic_mode && (numnodes > 1)) { printk("Remapping cross-quad port I/O for %d quads\n", numnodes); + xquad_portio = ioremap (XQUAD_PORTIO_BASE, + numnodes * XQUAD_PORTIO_QUAD); printk("xquad_portio vaddr 0x%08lx, len %08lx\n", (u_long) xquad_portio, - (u_long) numnodes * XQUAD_PORTIO_LEN); - xquad_portio = ioremap (XQUAD_PORTIO_BASE, - numnodes * XQUAD_PORTIO_LEN); + (u_long) numnodes * XQUAD_PORTIO_QUAD); } /* diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c index f7042004ead4..69b1805151ad 100644 --- a/arch/i386/kernel/sys_i386.c +++ b/arch/i386/kernel/sys_i386.c @@ -272,10 +272,9 @@ get_addr(unsigned long addr, unsigned long len) return -ENOMEM; if (!vma || ((addr + len) < vma->vm_start)) goto found_addr; - addr = vma->vm_end; + addr = HPAGE_ALIGN(vma->vm_end); } found_addr: - addr = HPAGE_ALIGN(addr); return addr; } diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 2b40242f3ab2..eff7ee947ea7 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -9,9 +9,9 @@ # export-objs := elevator.o ll_rw_blk.o loop.o genhd.o acsi.o \ - block_ioctl.o + block_ioctl.o deadline-iosched.o -obj-y := elevator.o ll_rw_blk.o blkpg.o genhd.o block_ioctl.o +obj-y := elevator.o ll_rw_blk.o blkpg.o genhd.o block_ioctl.o deadline-iosched.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c new file mode 100644 index 000000000000..a32d24ef7c50 --- /dev/null +++ b/drivers/block/deadline-iosched.c @@ -0,0 +1,557 @@ +/* + * linux/drivers/block/deadline-iosched.c + * + * Deadline i/o scheduler. + * + * Copyright (C) 2002 Jens Axboe <axboe@suse.de> + */ +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/elevator.h> +#include <linux/bio.h> +#include <linux/blk.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/hash.h> + +/* + * feel free to try other values :-). read_expire value is the timeout for + * reads, our goal is to start a request "around" the time when it expires. + * fifo_batch is how many steps along the sorted list we will take when the + * front fifo request expires. 
+ */ +static int read_expire = HZ / 2; /* 500ms start timeout */ +static int fifo_batch = 64; /* 4 seeks, or 64 contig */ +static int seek_cost = 16; /* seek is 16 times more expensive */ + +/* + * how many times reads are allowed to starve writes + */ +static int writes_starved = 2; + +static const int deadline_hash_shift = 8; +#define DL_HASH_BLOCK(sec) ((sec) >> 3) +#define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift)) +#define DL_HASH_ENTRIES (1 << deadline_hash_shift) + +#define DL_INVALIDATE_HASH(dd) \ + do { \ + if (!++(dd)->hash_valid_count) \ + (dd)->hash_valid_count = 1; \ + } while (0) + +struct deadline_data { + /* + * run time data + */ + struct list_head sort_list[2]; /* sorted listed */ + struct list_head read_fifo; /* fifo list */ + struct list_head *dispatch; /* driver dispatch queue */ + struct list_head *hash; /* request hash */ + sector_t last_sector; /* last sector sent to drive */ + unsigned long hash_valid_count; /* barrier hash count */ + unsigned int starved; /* writes starved */ + + /* + * settings that change how the i/o scheduler behaves + */ + unsigned int fifo_batch; + unsigned long read_expire; + unsigned int seek_cost; + unsigned int writes_starved; +}; + +/* + * pre-request data. + */ +struct deadline_rq { + struct list_head fifo; + struct list_head hash; + unsigned long hash_valid_count; + struct request *request; + unsigned long expires; +}; + +static kmem_cache_t *drq_pool; + +#define RQ_DATA(rq) ((struct deadline_rq *) (rq)->elevator_private) + +/* + * rq hash + */ +static inline void __deadline_del_rq_hash(struct deadline_rq *drq) +{ + drq->hash_valid_count = 0; + list_del_init(&drq->hash); +} + +#define ON_HASH(drq) (drq)->hash_valid_count +static inline void deadline_del_rq_hash(struct deadline_rq *drq) +{ + if (ON_HASH(drq)) + __deadline_del_rq_hash(drq); +} + +static inline void +deadline_add_rq_hash(struct deadline_data *dd, struct deadline_rq *drq) +{ + struct request *rq = drq->request; + + BUG_ON(ON_HASH(drq)); + + drq->hash_valid_count = dd->hash_valid_count; + list_add(&drq->hash, &dd->hash[DL_HASH_FN(rq->sector +rq->nr_sectors)]); +} + +#define list_entry_hash(ptr) list_entry((ptr), struct deadline_rq, hash) +static struct request * +deadline_find_hash(struct deadline_data *dd, sector_t offset) +{ + struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)]; + struct list_head *entry, *next = hash_list->next; + struct deadline_rq *drq; + struct request *rq = NULL; + + while ((entry = next) != hash_list) { + next = entry->next; + drq = list_entry_hash(entry); + + BUG_ON(!drq->hash_valid_count); + + if (!rq_mergeable(drq->request) + || drq->hash_valid_count != dd->hash_valid_count) { + __deadline_del_rq_hash(drq); + continue; + } + + if (drq->request->sector + drq->request->nr_sectors == offset) { + rq = drq->request; + break; + } + } + + return rq; +} + +static int +deadline_merge(request_queue_t *q, struct request **req, struct bio *bio) +{ + struct deadline_data *dd = q->elevator.elevator_data; + const int data_dir = bio_data_dir(bio); + struct list_head *entry, *sort_list; + struct deadline_rq *drq; + struct request *__rq; + int ret = ELEVATOR_NO_MERGE; + + /* + * try last_merge to avoid going to hash + */ + ret = elv_try_last_merge(q, req, bio); + if (ret != ELEVATOR_NO_MERGE) + goto out; + + /* + * see if the merge hash can satisfy a back merge + */ + if ((__rq = deadline_find_hash(dd, bio->bi_sector))) { + BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); + + if (elv_rq_merge_ok(__rq, bio)) { + *req = 
__rq; + q->last_merge = &__rq->queuelist; + ret = ELEVATOR_BACK_MERGE; + goto out_ret; + } + } + + entry = sort_list = &dd->sort_list[data_dir]; + while ((entry = entry->prev) != sort_list) { + __rq = list_entry_rq(entry); + drq = RQ_DATA(__rq); + + BUG_ON(__rq->flags & REQ_STARTED); + + if (!(__rq->flags & REQ_CMD)) + continue; + + if (!*req && bio_rq_in_between(bio, __rq, sort_list)) + *req = __rq; + + if (__rq->flags & REQ_BARRIER) + break; + + /* + * checking for a front merge, hash will miss those + */ + if (__rq->sector - bio_sectors(bio) == bio->bi_sector) { + ret = elv_try_merge(__rq, bio); + if (ret != ELEVATOR_NO_MERGE) { + *req = __rq; + q->last_merge = &__rq->queuelist; + break; + } + } + } + +out: + if (ret != ELEVATOR_NO_MERGE) { + struct deadline_rq *drq = RQ_DATA(*req); + + deadline_del_rq_hash(drq); + deadline_add_rq_hash(dd, drq); + } +out_ret: + return ret; +} + +static void +deadline_merge_request(request_queue_t *q, struct request *req, struct request *next) +{ + struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_rq *drq = RQ_DATA(req); + struct deadline_rq *dnext = RQ_DATA(next); + + BUG_ON(!drq); + BUG_ON(!dnext); + + deadline_del_rq_hash(drq); + deadline_add_rq_hash(dd, drq); + + /* + * if dnext expires before drq, assign it's expire time to drq + * and move into dnext position (dnext will be deleted) in fifo + */ + if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) { + if (time_before(dnext->expires, drq->expires)) { + list_move(&drq->fifo, &dnext->fifo); + drq->expires = dnext->expires; + } + } +} + +/* + * move request from sort list to dispatch queue. maybe remove from rq hash + * here too? + */ +static inline void +deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) +{ + struct deadline_rq *drq = RQ_DATA(rq); + + list_move_tail(&rq->queuelist, dd->dispatch); + list_del_init(&drq->fifo); +} + +/* + * move along sort list and move entries to dispatch queue, starting from rq + */ +static void deadline_move_requests(struct deadline_data *dd, struct request *rq) +{ + struct list_head *sort_head = &dd->sort_list[rq_data_dir(rq)]; + sector_t last_sec = dd->last_sector; + int batch_count = dd->fifo_batch; + + do { + struct list_head *nxt = rq->queuelist.next; + + /* + * take it off the sort and fifo list, move + * to dispatch queue + */ + deadline_move_to_dispatch(dd, rq); + + if (rq->sector == last_sec) + batch_count--; + else + batch_count -= dd->seek_cost; + + if (nxt == sort_head) + break; + + last_sec = rq->sector + rq->nr_sectors; + rq = list_entry_rq(nxt); + } while (batch_count > 0); +} + +/* + * returns 0 if there are no expired reads on the fifo, 1 otherwise + */ +#define list_entry_fifo(ptr) list_entry((ptr), struct deadline_rq, fifo) +static inline int deadline_check_fifo(struct deadline_data *dd) +{ + struct deadline_rq *drq; + + if (list_empty(&dd->read_fifo)) + return 0; + + drq = list_entry_fifo(dd->read_fifo.next); + if (time_before(jiffies, drq->expires)) + return 0; + + return 1; +} + +static struct request *deadline_next_request(request_queue_t *q) +{ + struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_rq *drq; + struct list_head *nxt; + struct request *rq; + int writes; + + /* + * if still requests on the dispatch queue, just grab the first one + */ + if (!list_empty(&q->queue_head)) { +dispatch: + rq = list_entry_rq(q->queue_head.next); + dd->last_sector = rq->sector + rq->nr_sectors; + return rq; + } + + writes = !list_empty(&dd->sort_list[WRITE]); + + /* + * if we have expired 
entries on the fifo list, move some to dispatch + */ + if (deadline_check_fifo(dd)) { + if (writes && (dd->starved++ >= dd->writes_starved)) + goto dispatch_writes; + + nxt = dd->read_fifo.next; + drq = list_entry_fifo(nxt); + deadline_move_requests(dd, drq->request); + goto dispatch; + } + + if (!list_empty(&dd->sort_list[READ])) { + if (writes && (dd->starved++ >= dd->writes_starved)) + goto dispatch_writes; + + nxt = dd->sort_list[READ].next; + deadline_move_requests(dd, list_entry_rq(nxt)); + goto dispatch; + } + + /* + * either there are no reads expired or on sort list, or the reads + * have starved writes for too long. dispatch some writes + */ + if (writes) { +dispatch_writes: + nxt = dd->sort_list[WRITE].next; + deadline_move_requests(dd, list_entry_rq(nxt)); + dd->starved = 0; + goto dispatch; + } + + BUG_ON(!list_empty(&dd->sort_list[READ])); + BUG_ON(writes); + return NULL; +} + +static void +deadline_add_request(request_queue_t *q, struct request *rq, struct list_head *insert_here) +{ + struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_rq *drq = RQ_DATA(rq); + const int data_dir = rq_data_dir(rq); + + /* + * flush hash on barrier insert, as not to allow merges before a + * barrier. + */ + if (unlikely(rq->flags & REQ_BARRIER)) { + DL_INVALIDATE_HASH(dd); + q->last_merge = NULL; + } + + /* + * add to sort list + */ + if (!insert_here) + insert_here = dd->sort_list[data_dir].prev; + + list_add(&rq->queuelist, insert_here); + + if (unlikely(!(rq->flags & REQ_CMD))) + return; + + if (rq_mergeable(rq)) { + deadline_add_rq_hash(dd, drq); + + if (!q->last_merge) + q->last_merge = &rq->queuelist; + } + + if (data_dir == READ) { + /* + * set expire time and add to fifo list + */ + drq->expires = jiffies + dd->read_expire; + list_add_tail(&drq->fifo, &dd->read_fifo); + } +} + +static void deadline_remove_request(request_queue_t *q, struct request *rq) +{ + struct deadline_rq *drq = RQ_DATA(rq); + + if (drq) { + list_del_init(&drq->fifo); + deadline_del_rq_hash(drq); + } +} + +static int deadline_queue_empty(request_queue_t *q) +{ + struct deadline_data *dd = q->elevator.elevator_data; + + if (!list_empty(&q->queue_head) || !list_empty(&dd->sort_list[READ]) + || !list_empty(&dd->sort_list[WRITE])) + return 0; + + BUG_ON(!list_empty(&dd->read_fifo)); + return 1; +} + +static struct list_head * +deadline_get_sort_head(request_queue_t *q, struct request *rq) +{ + struct deadline_data *dd = q->elevator.elevator_data; + + return &dd->sort_list[rq_data_dir(rq)]; +} + +static void deadline_exit(request_queue_t *q, elevator_t *e) +{ + struct deadline_data *dd = e->elevator_data; + struct deadline_rq *drq; + struct request *rq; + int i; + + BUG_ON(!list_empty(&dd->read_fifo)); + BUG_ON(!list_empty(&dd->sort_list[READ])); + BUG_ON(!list_empty(&dd->sort_list[WRITE])); + + for (i = READ; i <= WRITE; i++) { + struct request_list *rl = &q->rq[i]; + struct list_head *entry = &rl->free; + + if (list_empty(&rl->free)) + continue; + + while ((entry = entry->next) != &rl->free) { + rq = list_entry_rq(entry); + + if ((drq = RQ_DATA(rq)) == NULL) + continue; + + rq->elevator_private = NULL; + kmem_cache_free(drq_pool, drq); + } + } + + kfree(dd->hash); + kfree(dd); +} + +/* + * initialize elevator private data (deadline_data), and alloc a drq for + * each request on the free lists + */ +static int deadline_init(request_queue_t *q, elevator_t *e) +{ + struct deadline_data *dd; + struct deadline_rq *drq; + struct request *rq; + int i, ret = 0; + + if (!drq_pool) + return -ENOMEM; + + dd = 
kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + memset(dd, 0, sizeof(*dd)); + + dd->hash = kmalloc(sizeof(struct list_head)*DL_HASH_ENTRIES,GFP_KERNEL); + if (!dd->hash) { + kfree(dd); + return -ENOMEM; + } + + for (i = 0; i < DL_HASH_ENTRIES; i++) + INIT_LIST_HEAD(&dd->hash[i]); + + INIT_LIST_HEAD(&dd->read_fifo); + INIT_LIST_HEAD(&dd->sort_list[READ]); + INIT_LIST_HEAD(&dd->sort_list[WRITE]); + dd->dispatch = &q->queue_head; + dd->fifo_batch = fifo_batch; + dd->read_expire = read_expire; + dd->seek_cost = seek_cost; + dd->hash_valid_count = 1; + dd->writes_starved = writes_starved; + e->elevator_data = dd; + + for (i = READ; i <= WRITE; i++) { + struct request_list *rl = &q->rq[i]; + struct list_head *entry = &rl->free; + + if (list_empty(&rl->free)) + continue; + + while ((entry = entry->next) != &rl->free) { + rq = list_entry_rq(entry); + + drq = kmem_cache_alloc(drq_pool, GFP_KERNEL); + if (!drq) { + ret = -ENOMEM; + break; + } + + memset(drq, 0, sizeof(*drq)); + INIT_LIST_HEAD(&drq->fifo); + INIT_LIST_HEAD(&drq->hash); + drq->request = rq; + rq->elevator_private = drq; + } + } + + if (ret) + deadline_exit(q, e); + + return ret; +} + +static int __init deadline_slab_setup(void) +{ + drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + + if (!drq_pool) + panic("deadline: can't init slab pool\n"); + + return 0; +} + +module_init(deadline_slab_setup); + +elevator_t iosched_deadline = { + .elevator_merge_fn = deadline_merge, + .elevator_merge_req_fn = deadline_merge_request, + .elevator_next_req_fn = deadline_next_request, + .elevator_add_req_fn = deadline_add_request, + .elevator_remove_req_fn = deadline_remove_request, + .elevator_queue_empty_fn = deadline_queue_empty, + .elevator_get_sort_head_fn = deadline_get_sort_head, + .elevator_init_fn = deadline_init, + .elevator_exit_fn = deadline_exit, +}; + +EXPORT_SYMBOL(iosched_deadline); diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 10729a1f0c1c..68f2ded9d86e 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -157,114 +157,6 @@ inline int elv_try_last_merge(request_queue_t *q, struct request **req, return ret; } -static int bio_rq_before(struct bio *bio, struct request *rq) -{ - if (!kdev_same(to_kdev_t(bio->bi_bdev->bd_dev), rq->rq_dev)) - return 0; - return bio->bi_sector < rq->sector; -} - -/* - * elevator_linux starts here - */ -int elevator_linus_merge(request_queue_t *q, struct request **req, - struct bio *bio) -{ - struct list_head *entry, *good; - struct request *__rq; - int ret; - - if ((ret = elv_try_last_merge(q, req, bio))) - return ret; - - entry = &q->queue_head; - good = &q->queue_head; - ret = ELEVATOR_NO_MERGE; - while ((entry = entry->prev) != &q->queue_head) { - __rq = list_entry_rq(entry); - - if (__rq->flags & (REQ_BARRIER | REQ_STARTED)) - break; - if (!(__rq->flags & REQ_CMD)) - break; - - if (bio_data_dir(bio) != rq_data_dir(__rq)) { - if (bio_data_dir(bio) == WRITE) - break; - good = entry->prev; - continue; - } - - ret = elv_try_merge(__rq, bio); - if (ret) { - *req = __rq; - q->last_merge = &__rq->queuelist; - return ret; - } - - if (bio_rq_before(bio, __rq)) - good = entry->prev; - - } - - if (good != &q->queue_head) - *req = list_entry_rq(good); - - return ELEVATOR_NO_MERGE; -} - -void elevator_linus_merge_req(request_queue_t *q, struct request *req, - struct request *next) -{ - if (elv_linus_sequence(next) < elv_linus_sequence(req)) - elv_linus_sequence(req) = elv_linus_sequence(next); -} 
- -void elevator_linus_add_request(request_queue_t *q, struct request *rq, - struct list_head *insert_here) -{ - elevator_t *e = &q->elevator; - int lat = 0, *latency = e->elevator_data; - - if (!insert_here) - insert_here = q->queue_head.prev; - - if (!(rq->flags & REQ_BARRIER)) - lat = latency[rq_data_dir(rq)]; - - elv_linus_sequence(rq) = lat; - - list_add(&rq->queuelist, insert_here); - - /* - * new merges must not precede this barrier - */ - if (rq->flags & REQ_BARRIER) - q->last_merge = NULL; - else if (!q->last_merge) - q->last_merge = &rq->queuelist; -} - -int elevator_linus_init(request_queue_t *q, elevator_t *e) -{ - int *latency; - - latency = kmalloc(2 * sizeof(int), GFP_KERNEL); - if (!latency) - return -ENOMEM; - - latency[READ] = 1024; - latency[WRITE] = 2048; - - e->elevator_data = latency; - return 0; -} - -void elevator_linus_exit(request_queue_t *q, elevator_t *e) -{ - kfree(e->elevator_data); -} - /* * elevator noop * @@ -442,15 +334,6 @@ inline struct list_head *elv_get_sort_head(request_queue_t *q, return &q->queue_head; } -elevator_t elevator_linus = { - elevator_merge_fn: elevator_linus_merge, - elevator_merge_req_fn: elevator_linus_merge_req, - elevator_next_req_fn: elevator_noop_next_request, - elevator_add_req_fn: elevator_linus_add_request, - elevator_init_fn: elevator_linus_init, - elevator_exit_fn: elevator_linus_exit, -}; - elevator_t elevator_noop = { elevator_merge_fn: elevator_noop_merge, elevator_next_req_fn: elevator_noop_next_request, @@ -459,7 +342,6 @@ elevator_t elevator_noop = { module_init(elevator_global_init); -EXPORT_SYMBOL(elevator_linus); EXPORT_SYMBOL(elevator_noop); EXPORT_SYMBOL(__elv_add_request); diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 605f474d2f16..a2595200d838 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1175,7 +1175,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) if (blk_init_free_list(q)) return -ENOMEM; - if ((ret = elevator_init(q, &q->elevator, elevator_linus))) { + if ((ret = elevator_init(q, &q->elevator, iosched_deadline))) { blk_cleanup_queue(q); return ret; } @@ -1233,24 +1233,23 @@ static struct request *get_request(request_queue_t *q, int rw) */ static struct request *get_request_wait(request_queue_t *q, int rw) { - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct request_list *rl = &q->rq[rw]; struct request *rq; spin_lock_prefetch(q->queue_lock); generic_unplug_device(q); - add_wait_queue_exclusive(&rl->wait, &wait); do { - set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(&rl->wait, &wait, + TASK_UNINTERRUPTIBLE); if (!rl->count) schedule(); + finish_wait(&rl->wait, &wait); spin_lock_irq(q->queue_lock); rq = get_request(q, rw); spin_unlock_irq(q->queue_lock); } while (rq == NULL); - remove_wait_queue(&rl->wait, &wait); - current->state = TASK_RUNNING; return rq; } @@ -1460,18 +1459,16 @@ void blk_put_request(struct request *req) */ void blk_congestion_wait(int rw, long timeout) { - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct congestion_state *cs = &congestion_states[rw]; if (atomic_read(&cs->nr_congested_queues) == 0) return; blk_run_queues(); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&cs->wqh, &wait); + prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&cs->nr_congested_queues) != 0) schedule_timeout(timeout); - set_current_state(TASK_RUNNING); - remove_wait_queue(&cs->wqh, &wait); + finish_wait(&cs->wqh, &wait); } /* diff --git 
a/drivers/block/loop.c b/drivers/block/loop.c index ccfa6f776ef0..d55beac14697 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -157,18 +157,12 @@ struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { #define MAX_DISK_SIZE 1024*1024*1024 -static unsigned long -compute_loop_size(struct loop_device *lo, struct dentry * lo_dentry) -{ - loff_t size = lo_dentry->d_inode->i_mapping->host->i_size; - return (size - lo->lo_offset) >> BLOCK_SIZE_BITS; -} - static void figure_loop_size(struct loop_device *lo) { - set_capacity(disks + lo->lo_number, compute_loop_size(lo, - lo->lo_backing_file->f_dentry)); - + loff_t size = lo->lo_backing_file->f_dentry->d_inode->i_size; + + set_capacity(disks + lo->lo_number, + (size - lo->lo_offset) >> 9); } static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf, diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c index 06b5495a79fd..cbb0abb97407 100644 --- a/drivers/ide/pci/cy82c693.c +++ b/drivers/ide/pci/cy82c693.c @@ -338,6 +338,9 @@ static void cy82c693_tune_drive (ide_drive_t *drive, u8 pio) */ unsigned int __init init_chipset_cy82c693(struct pci_dev *dev, const char *name) { + if (PCI_FUNC(dev->devfn) != 1) + return 0; + #ifdef CY82C693_SETDMA_CLOCK u8 data = 0; #endif /* CY82C693_SETDMA_CLOCK */ @@ -411,20 +414,30 @@ void __init init_hwif_cy82c693(ide_hwif_t *hwif) #endif /* CONFIG_BLK_DEV_IDEDMA */ } -void __init init_dma_cy82c693 (ide_hwif_t *hwif, unsigned long dmabase) +static __initdata ide_hwif_t *primary; + +void __init init_iops_cy82c693(ide_hwif_t *hwif) { - ide_setup_dma(hwif, dmabase, 8); + if (PCI_FUNC(hwif->pci_dev->devfn) == 1) + primary = hwif; + else { + hwif->mate = primary; + hwif->channel = 1; + } } -extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *); - static int __devinit cy82c693_init_one(struct pci_dev *dev, const struct pci_device_id *id) { ide_pci_device_t *d = &cy82c693_chipsets[id->driver_data]; - if ((!(PCI_FUNC(dev->devfn) & 1) || - (!((dev->class >> 8) == PCI_CLASS_STORAGE_IDE)))) - return 0; /* CY82C693 is more than only a IDE controller */ - ide_setup_pci_device(dev, d); + struct pci_dev *dev2; + + /* CY82C693 is more than only a IDE controller. + Function 1 is primary IDE channel, function 2 - secondary. 
*/ + if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && + PCI_FUNC(dev->devfn) == 1) { + dev2 = pci_find_slot(dev->bus->number, dev->devfn + 1); + ide_setup_pci_devices(dev, dev2, d); + } return 0; } diff --git a/drivers/ide/pci/cy82c693.h b/drivers/ide/pci/cy82c693.h index b5c6f9652d51..d7c8d19a8523 100644 --- a/drivers/ide/pci/cy82c693.h +++ b/drivers/ide/pci/cy82c693.h @@ -66,7 +66,7 @@ typedef struct pio_clocks_s { extern unsigned int init_chipset_cy82c693(struct pci_dev *, const char *); extern void init_hwif_cy82c693(ide_hwif_t *); -extern void init_dma_cy82c693(ide_hwif_t *, unsigned long); +extern void init_iops_cy82c693(ide_hwif_t *); static ide_pci_device_t cy82c693_chipsets[] __initdata = { { /* 0 */ @@ -74,10 +74,10 @@ static ide_pci_device_t cy82c693_chipsets[] __initdata = { device: PCI_DEVICE_ID_CONTAQ_82C693, name: "CY82C693", init_chipset: init_chipset_cy82c693, - init_iops: NULL, + init_iops: init_iops_cy82c693, init_hwif: init_hwif_cy82c693, - init_dma: init_dma_cy82c693, - channels: 2, + init_dma: NULL, + channels: 1, autodma: AUTODMA, enablebits: {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, bootable: ON_BOARD, diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 29dfacd9b29d..8ef4fc169dec 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -250,6 +250,7 @@ second_chance_to_dma: switch(dev->device) { case PCI_DEVICE_ID_AL_M5219: + case PCI_DEVICE_ID_AL_M5229: case PCI_DEVICE_ID_AMD_VIPER_7409: case PCI_DEVICE_ID_CMD_643: case PCI_DEVICE_ID_SERVERWORKS_CSB5IDE: diff --git a/drivers/pnp/pnpbios_proc.c b/drivers/pnp/pnpbios_proc.c index 6d7c5e1b1321..56130105496e 100644 --- a/drivers/pnp/pnpbios_proc.c +++ b/drivers/pnp/pnpbios_proc.c @@ -68,6 +68,7 @@ static int proc_read_escdinfo(char *buf, char **start, off_t pos, ); } +#define MAX_SANE_ESCD_SIZE (32*1024) static int proc_read_escd(char *buf, char **start, off_t pos, int count, int *eof, void *data) { @@ -79,8 +80,8 @@ static int proc_read_escd(char *buf, char **start, off_t pos, return -EIO; /* sanity check */ - if (escd.escd_size > (32*1024)) { - printk(KERN_ERR "PnPBIOS: proc_read_escd: ESCD size is too great\n"); + if (escd.escd_size > MAX_SANE_ESCD_SIZE) { + printk(KERN_ERR "PnPBIOS: proc_read_escd: ESCD size reported by BIOS escd_info call is too great\n"); return -EFBIG; } @@ -90,7 +91,14 @@ static int proc_read_escd(char *buf, char **start, off_t pos, if (pnp_bios_read_escd(tmpbuf, escd.nv_storage_base)) return -EIO; - escd_size = (unsigned char)(buf[0]) + (unsigned char)(buf[1])*256; + escd_size = (unsigned char)(tmpbuf[0]) + (unsigned char)(tmpbuf[1])*256; + + /* sanity check */ + if (escd_size > MAX_SANE_ESCD_SIZE) { + printk(KERN_ERR "PnPBIOS: proc_read_escd: ESCD size reported by BIOS read_escd call is too great\n"); + return -EFBIG; + } + escd_left_to_read = escd_size - pos; if (escd_left_to_read < 0) escd_left_to_read = 0; if (escd_left_to_read == 0) *eof = 1; diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 3e71b678fb64..ea3a9df5f953 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -148,6 +148,11 @@ Fix bug in raw command post with data ioctl method. Fix bug where rollcall sometimes failed with cable errors. Print unit # on all command timeouts. + 1.02.00.026 - Fix possible infinite retry bug with power glitch induced + drive timeouts. + Cleanup some AEN severity levels. + 1.02.00.027 - Add drive not supported AEN code for SATA controllers. + Remove spurious unknown ioctl error message. 
*/ #include <linux/module.h> @@ -201,7 +206,7 @@ static struct notifier_block tw_notifier = { }; /* Globals */ -char *tw_driver_version="1.02.00.025"; +char *tw_driver_version="1.02.00.027"; TW_Device_Extension *tw_device_extension_list[TW_MAX_SLOT]; int tw_device_extension_count = 0; @@ -212,7 +217,7 @@ int tw_aen_complete(TW_Device_Extension *tw_dev, int request_id) { TW_Param *param; unsigned short aen; - int error = 0; + int error = 0, table_max = 0; dprintk(KERN_WARNING "3w-xxxx: tw_aen_complete()\n"); if (tw_dev->alignment_virtual_address[request_id] == NULL) { @@ -227,7 +232,8 @@ int tw_aen_complete(TW_Device_Extension *tw_dev, int request_id) if (aen == 0x0ff) { printk(KERN_WARNING "3w-xxxx: scsi%d: AEN: INFO: AEN queue overflow.\n", tw_dev->host->host_no); } else { - if ((aen & 0x0ff) < TW_AEN_STRING_MAX) { + table_max = sizeof(tw_aen_string)/sizeof(char *); + if ((aen & 0x0ff) < table_max) { if ((tw_aen_string[aen & 0xff][strlen(tw_aen_string[aen & 0xff])-1]) == '#') { printk(KERN_WARNING "3w-xxxx: scsi%d: AEN: %s%d.\n", tw_dev->host->host_no, tw_aen_string[aen & 0xff], aen >> 8); } else { @@ -289,7 +295,7 @@ int tw_aen_drain_queue(TW_Device_Extension *tw_dev) int first_reset = 0; int queue = 0; int imax, i; - int found = 0; + int found = 0, table_max = 0; dprintk(KERN_NOTICE "3w-xxxx: tw_aen_drain_queue()\n"); @@ -409,7 +415,8 @@ int tw_aen_drain_queue(TW_Device_Extension *tw_dev) if (aen == 0x0ff) { printk(KERN_WARNING "3w-xxxx: AEN: INFO: AEN queue overflow.\n"); } else { - if ((aen & 0x0ff) < TW_AEN_STRING_MAX) { + table_max = sizeof(tw_aen_string)/sizeof(char *); + if ((aen & 0x0ff) < table_max) { if ((tw_aen_string[aen & 0xff][strlen(tw_aen_string[aen & 0xff])-1]) == '#') { printk(KERN_WARNING "3w-xxxx: AEN: %s%d.\n", tw_aen_string[aen & 0xff], aen >> 8); } else { @@ -1442,7 +1449,8 @@ static void tw_interrupt(int irq, void *dev_instance, struct pt_regs *regs) /* If error, command failed */ if (error == 1) { - tw_dev->srb[request_id]->result = (DID_RESET << 16); + /* Ask for a host reset */ + tw_dev->srb[request_id]->result = (DID_OK << 16) | (CHECK_CONDITION << 1); } /* Now complete the io */ @@ -1784,7 +1792,7 @@ int tw_ioctl(TW_Device_Extension *tw_dev, int request_id) return 1; } default: - printk(KERN_WARNING "3w-xxxx: Unknown ioctl 0x%x.\n", opcode); + dprintk(KERN_WARNING "3w-xxxx: Unknown ioctl 0x%x.\n", opcode); tw_dev->state[request_id] = TW_S_COMPLETED; tw_state_request_finish(tw_dev, request_id); tw_dev->srb[request_id]->result = (DID_OK << 16); diff --git a/drivers/scsi/3w-xxxx.h b/drivers/scsi/3w-xxxx.h index 3f96e3753da3..5a5d7f6a056f 100644 --- a/drivers/scsi/3w-xxxx.h +++ b/drivers/scsi/3w-xxxx.h @@ -90,14 +90,13 @@ static char *tw_aen_string[] = { "INFO: Verify started: Unit #", // 0x029 "ERROR: Verify failed: Port #", // 0x02A "INFO: Verify complete: Unit #", // 0x02B - "ERROR: Overwrote bad sector during rebuild: Port #", //0x02C - "ERROR: Encountered bad sector during rebuild: Port #", //0x02D - "INFO: Replacement drive is too small: Port #", //0x02E - "WARNING: Verify error: Unit not previously initialized: Unit #" //0x02F + "WARNING: Overwrote bad sector during rebuild: Port #", //0x02C + "ERROR: Encountered bad sector during rebuild: Port #", //0x02D + "ERROR: Replacement drive is too small: Port #", //0x02E + "WARNING: Verify error: Unit not previously initialized: Unit #", //0x02F + "ERROR: Drive not supported: Port #" // 0x030 }; -#define TW_AEN_STRING_MAX 0x030 - /* Sense key lookup table Format: 
ESDC/flags,SenseKey,AdditionalSenseCode,AdditionalSenseCodeQualifier diff --git a/fs/buffer.c b/fs/buffer.c index 0b9766099e3d..4f1c380230be 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -128,22 +128,18 @@ void unlock_buffer(struct buffer_head *bh) */ void __wait_on_buffer(struct buffer_head * bh) { - wait_queue_head_t *wq = bh_waitq_head(bh); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + wait_queue_head_t *wqh = bh_waitq_head(bh); + DEFINE_WAIT(wait); get_bh(bh); - add_wait_queue(wq, &wait); do { + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); blk_run_queues(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!buffer_locked(bh)) - break; - schedule(); + if (buffer_locked(bh)) + schedule(); } while (buffer_locked(bh)); - tsk->state = TASK_RUNNING; - remove_wait_queue(wq, &wait); put_bh(bh); + finish_wait(wqh, &wait); } static inline void @@ -246,10 +242,12 @@ int fsync_bdev(struct block_device *bdev) } /* - * sync everything. + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. */ asmlinkage long sys_sync(void) { + wakeup_bdflush(0); sync_inodes(0); /* All mappings and inodes, including block devices */ DQUOT_SYNC(NULL); sync_supers(); /* Write the superblocks */ diff --git a/fs/dcache.c b/fs/dcache.c index ac127d32eed9..1715f006ccd4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -329,12 +329,11 @@ static inline void prune_one_dentry(struct dentry * dentry) void prune_dcache(int count) { spin_lock(&dcache_lock); - for (;;) { + for (; count ; count--) { struct dentry *dentry; struct list_head *tmp; tmp = dentry_unused.prev; - if (tmp == &dentry_unused) break; list_del_init(tmp); @@ -349,12 +348,8 @@ void prune_dcache(int count) dentry_stat.nr_unused--; /* Unused dentry with a count? */ - if (atomic_read(&dentry->d_count)) - BUG(); - + BUG_ON(atomic_read(&dentry->d_count)); prune_one_dentry(dentry); - if (!--count) - break; } spin_unlock(&dcache_lock); } @@ -573,19 +568,11 @@ void shrink_dcache_anon(struct list_head *head) /* * This is called from kswapd when we think we need some - * more memory, but aren't really sure how much. So we - * carefully try to free a _bit_ of our dcache, but not - * too much. - * - * Priority: - * 1 - very urgent: shrink everything - * ... - * 6 - base-level: try to shrink a bit. + * more memory. */ -int shrink_dcache_memory(int priority, unsigned int gfp_mask) +int shrink_dcache_memory(int ratio, unsigned int gfp_mask) { - int count = 0; - + int entries = dentry_stat.nr_dentry / ratio + 1; /* * Nasty deadlock avoidance. * @@ -600,11 +587,8 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask) if (!(gfp_mask & __GFP_FS)) return 0; - count = dentry_stat.nr_unused / priority; - - prune_dcache(count); - kmem_cache_shrink(dentry_cache); - return 0; + prune_dcache(entries); + return entries; } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) diff --git a/fs/dquot.c b/fs/dquot.c index 58095d92cbee..3b1efaef018a 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -480,26 +480,17 @@ static void prune_dqcache(int count) /* * This is called from kswapd when we think we need some - * more memory, but aren't really sure how much. So we - * carefully try to free a _bit_ of our dqcache, but not - * too much. - * - * Priority: - * 1 - very urgent: shrink everything - * ... - * 6 - base-level: try to shrink a bit. 
+ * more memory */ -int shrink_dqcache_memory(int priority, unsigned int gfp_mask) +int shrink_dqcache_memory(int ratio, unsigned int gfp_mask) { - int count = 0; + int entries = dqstats.allocated_dquots / ratio + 1; lock_kernel(); - count = dqstats.free_dquots / priority; - prune_dqcache(count); + prune_dqcache(entries); unlock_kernel(); - kmem_cache_shrink(dquot_cachep); - return 0; + return entries; } /* diff --git a/fs/inode.c b/fs/inode.c index 89c96e221043..c07e1e7e1a35 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -386,10 +386,11 @@ void prune_icache(int goal) count = 0; entry = inode_unused.prev; - while (entry != &inode_unused) - { + for(; goal; goal--) { struct list_head *tmp = entry; + if (entry == &inode_unused) + break; entry = entry->prev; inode = INODE(tmp); if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) @@ -403,8 +404,6 @@ void prune_icache(int goal) list_add(tmp, freeable); inode->i_state |= I_FREEING; count++; - if (!--goal) - break; } inodes_stat.nr_unused -= count; spin_unlock(&inode_lock); @@ -414,19 +413,11 @@ void prune_icache(int goal) /* * This is called from kswapd when we think we need some - * more memory, but aren't really sure how much. So we - * carefully try to free a _bit_ of our icache, but not - * too much. - * - * Priority: - * 1 - very urgent: shrink everything - * ... - * 6 - base-level: try to shrink a bit. + * more memory. */ -int shrink_icache_memory(int priority, int gfp_mask) +int shrink_icache_memory(int ratio, unsigned int gfp_mask) { - int count = 0; - + int entries = inodes_stat.nr_inodes / ratio + 1; /* * Nasty deadlock avoidance.. * @@ -437,12 +428,10 @@ int shrink_icache_memory(int priority, int gfp_mask) if (!(gfp_mask & __GFP_FS)) return 0; - count = inodes_stat.nr_unused / priority; - - prune_icache(count); - kmem_cache_shrink(inode_cachep); - return 0; + prune_icache(entries); + return entries; } +EXPORT_SYMBOL(shrink_icache_memory); /* * Called with the inode lock held. diff --git a/fs/locks.c b/fs/locks.c index 3702820a3de1..ab969a790fca 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -252,7 +252,7 @@ static int flock_make_lock(struct file *filp, return -ENOMEM; fl->fl_file = filp; - fl->fl_pid = current->pid; + fl->fl_pid = current->tgid; fl->fl_flags = (cmd & LOCK_NB) ? FL_FLOCK : FL_FLOCK | FL_SLEEP; fl->fl_type = type; fl->fl_end = OFFSET_MAX; @@ -308,7 +308,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, fl->fl_end = OFFSET_MAX; fl->fl_owner = current->files; - fl->fl_pid = current->pid; + fl->fl_pid = current->tgid; fl->fl_file = filp; fl->fl_flags = FL_POSIX; fl->fl_notify = NULL; @@ -348,7 +348,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, fl->fl_end = OFFSET_MAX; fl->fl_owner = current->files; - fl->fl_pid = current->pid; + fl->fl_pid = current->tgid; fl->fl_file = filp; fl->fl_flags = FL_POSIX; fl->fl_notify = NULL; @@ -377,7 +377,7 @@ static int lease_alloc(struct file *filp, int type, struct file_lock **flp) return -ENOMEM; fl->fl_owner = current->files; - fl->fl_pid = current->pid; + fl->fl_pid = current->tgid; fl->fl_file = filp; fl->fl_flags = FL_LEASE; @@ -669,7 +669,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, int error; fl.fl_owner = current->files; - fl.fl_pid = current->pid; + fl.fl_pid = current->tgid; fl.fl_file = filp; fl.fl_flags = FL_POSIX | FL_ACCESS | FL_SLEEP; fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? 
F_WRLCK : F_RDLCK; @@ -1241,7 +1241,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) *before = fl; list_add(&fl->fl_link, &file_lock_list); - error = f_setown(filp, current->pid, 1); + error = f_setown(filp, current->tgid, 1); out_unlock: unlock_kernel(); return error; @@ -1632,7 +1632,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) lock.fl_start = 0; lock.fl_end = OFFSET_MAX; lock.fl_owner = owner; - lock.fl_pid = current->pid; + lock.fl_pid = current->tgid; lock.fl_file = filp; if (filp->f_op && filp->f_op->lock != NULL) { diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h index b2afa09c062f..41bc8ef0bdf8 100644 --- a/include/asm-i386/io.h +++ b/include/asm-i386/io.h @@ -40,7 +40,6 @@ #define XQUAD_PORTIO_BASE 0xfe400000 #define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ -#define XQUAD_PORTIO_LEN 0x80000 /* Only remapping first 2 quads */ #ifdef __KERNEL__ diff --git a/include/asm-i386/semaphore.h b/include/asm-i386/semaphore.h index a0ce1b8dba69..9c456727e8a3 100644 --- a/include/asm-i386/semaphore.h +++ b/include/asm-i386/semaphore.h @@ -116,7 +116,7 @@ static inline void down(struct semaphore * sem) #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif - + might_sleep(); __asm__ __volatile__( "# atomic down operation\n\t" LOCK "decl %0\n\t" /* --sem->count */ @@ -142,7 +142,7 @@ static inline int down_interruptible(struct semaphore * sem) #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif - + might_sleep(); __asm__ __volatile__( "# atomic interruptible down operation\n\t" LOCK "decl %1\n\t" /* --sem->count */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f99a03f17e60..a64a657545fe 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -186,7 +186,7 @@ extern int shrink_dcache_memory(int, unsigned int); extern void prune_dcache(int); /* icache memory management (defined in linux/fs/inode.c) */ -extern int shrink_icache_memory(int, int); +extern int shrink_icache_memory(int, unsigned int); extern void prune_icache(int); /* quota cache memory management (defined in linux/fs/dquot.c) */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index e98168f92e67..c5cc69788530 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -52,12 +52,10 @@ extern inline struct list_head *elv_get_sort_head(request_queue_t *, struct requ extern elevator_t elevator_noop; /* - * elevator linus. based on linus ideas of starvation control, using - * sequencing to manage inserts and merges. + * deadline i/o scheduler. uses request time outs to prevent indefinite + * starvation */ -extern elevator_t elevator_linus; -#define elv_linus_sequence(rq) ((long)(rq)->elevator_private) -#define ELV_LINUS_SEEK_COST 16 +extern elevator_t iosched_deadline; /* * use the /proc/iosched interface, all the below is history -> diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5efa540d55f8..44c38b134498 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -40,6 +40,13 @@ struct completion; +#ifdef CONFIG_DEBUG_KERNEL +void __might_sleep(char *file, int line); +#define might_sleep() __might_sleep(__FILE__, __LINE__) +#else +#define might_sleep() do {} while(0) +#endif + extern struct notifier_block *panic_notifier_list; NORET_TYPE void panic(const char * fmt, ...) 
__attribute__ ((NORET_AND format (printf, 1, 2))); diff --git a/include/linux/mm.h b/include/linux/mm.h index c63e4947387f..482db998aca7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -524,6 +524,7 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned lon extern struct page * vmalloc_to_page(void *addr); extern unsigned long get_page_cache_size(void); +extern unsigned int nr_used_zone_pages(void); #endif /* __KERNEL__ */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 43390b2e2ef4..bfc986131fe6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -74,9 +74,15 @@ static inline void ___add_to_page_cache(struct page *page, inc_page_state(nr_pagecache); } -extern void FASTCALL(lock_page(struct page *page)); +extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); +static inline void lock_page(struct page *page) +{ + if (TestSetPageLocked(page)) + __lock_page(page); +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. * Never use this directly! diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4a7e2bb0d7c4..bfb988885002 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -40,6 +40,7 @@ extern void FASTCALL(rwsemtrace(struct rw_semaphore *sem, const char *str)); */ static inline void down_read(struct rw_semaphore *sem) { + might_sleep(); rwsemtrace(sem,"Entering down_read"); __down_read(sem); rwsemtrace(sem,"Leaving down_read"); @@ -62,6 +63,7 @@ static inline int down_read_trylock(struct rw_semaphore *sem) */ static inline void down_write(struct rw_semaphore *sem) { + might_sleep(); rwsemtrace(sem,"Entering down_write"); __down_write(sem); rwsemtrace(sem,"Leaving down_write"); diff --git a/include/linux/sched.h b/include/linux/sched.h index f1346010d73e..471dcb9c108d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,8 +100,9 @@ extern unsigned long nr_uninterruptible(void); #define TASK_RUNNING 0 #define TASK_INTERRUPTIBLE 1 #define TASK_UNINTERRUPTIBLE 2 -#define TASK_ZOMBIE 4 -#define TASK_STOPPED 8 +#define TASK_STOPPED 4 +#define TASK_ZOMBIE 8 +#define TASK_DEAD 16 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) diff --git a/include/linux/wait.h b/include/linux/wait.h index 8664b02f230d..b6ce459f8792 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -119,6 +119,32 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, _raced; \ }) +/* + * Waitqueue's which are removed from the waitqueue_head at wakeup time + */ +void FASTCALL(prepare_to_wait(wait_queue_head_t *q, + wait_queue_t *wait, int state)); +void FASTCALL(prepare_to_wait_exclusive(wait_queue_head_t *q, + wait_queue_t *wait, int state)); +void FASTCALL(finish_wait(wait_queue_head_t *q, wait_queue_t *wait)); +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync); + +#define DEFINE_WAIT(name) \ + wait_queue_t name = { \ + .task = current, \ + .func = autoremove_wake_function, \ + .task_list = { .next = &name.task_list, \ + .prev = &name.task_list, \ + }, \ + } + +#define init_wait(wait) \ + do { \ + wait->task = current; \ + wait->func = autoremove_wake_function; \ + INIT_LIST_HEAD(&wait->task_list); \ + } while (0) + #endif /* __KERNEL__ */ #endif diff --git a/kernel/exit.c b/kernel/exit.c index 7189e9bce6d4..6ed07def4c62 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -32,6 +32,7 @@ int getrusage(struct task_struct *, int, struct rusage *); 
static struct dentry * __unhash_process(struct task_struct *p) { struct dentry *proc_dentry; + nr_threads--; detach_pid(p, PIDTYPE_PID); detach_pid(p, PIDTYPE_TGID); @@ -57,31 +58,31 @@ static struct dentry * __unhash_process(struct task_struct *p) void release_task(struct task_struct * p) { struct dentry *proc_dentry; + task_t *leader; - if (p->state != TASK_ZOMBIE) + if (p->state < TASK_ZOMBIE) BUG(); if (p != current) wait_task_inactive(p); atomic_dec(&p->user->processes); security_ops->task_free_security(p); free_uid(p->user); - if (unlikely(p->ptrace)) { - write_lock_irq(&tasklist_lock); + write_lock_irq(&tasklist_lock); + if (unlikely(p->ptrace)) __ptrace_unlink(p); - write_unlock_irq(&tasklist_lock); - } BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); - write_lock_irq(&tasklist_lock); __exit_sighand(p); proc_dentry = __unhash_process(p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the - * group leader's parent process. + * group leader's parent process. (if it wants notification.) */ - if (p->group_leader != p && thread_group_empty(p)) - do_notify_parent(p->group_leader, p->group_leader->exit_signal); + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && + leader->state == TASK_ZOMBIE && leader->exit_signal != -1) + do_notify_parent(leader, leader->exit_signal); p->parent->cutime += p->utime + p->cutime; p->parent->cstime += p->stime + p->cstime; @@ -159,7 +160,7 @@ static int __will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { if (p == ignored_task - || p->state == TASK_ZOMBIE + || p->state >= TASK_ZOMBIE || p->real_parent->pid == 1) continue; if (p->real_parent->pgrp != pgrp @@ -435,8 +436,11 @@ void exit_mm(struct task_struct *tsk) static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) { - /* Make sure we're not reparenting to ourselves. */ - if (p == reaper) + /* + * Make sure we're not reparenting to ourselves and that + * the parent is not a zombie. + */ + if (p == reaper || reaper->state >= TASK_ZOMBIE) p->real_parent = child_reaper; else p->real_parent = reaper; @@ -774,9 +778,10 @@ static int eligible_child(pid_t pid, int options, task_t *p) asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru) { - int flag, retval; DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; + unsigned long state; + int flag, retval; if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; @@ -827,7 +832,15 @@ repeat: */ if (ret == 2) continue; + /* + * Try to move the task's state to DEAD + * only one thread is allowed to do this: + */ + state = xchg(&p->state, TASK_DEAD); + if (state != TASK_ZOMBIE) + continue; read_unlock(&tasklist_lock); + retval = ru ? 
getrusage(p, RUSAGE_BOTH, ru) : 0; if (!retval && stat_addr) { if (p->sig->group_exit) @@ -835,13 +848,16 @@ repeat: else retval = put_user(p->exit_code, stat_addr); } - if (retval) - goto end_wait4; + if (retval) { + p->state = TASK_ZOMBIE; + goto end_wait4; + } retval = p->pid; if (p->real_parent != p->parent) { write_lock_irq(&tasklist_lock); __ptrace_unlink(p); do_notify_parent(p, SIGCHLD); + p->state = TASK_ZOMBIE; write_unlock_irq(&tasklist_lock); } else release_task(p); diff --git a/kernel/fork.c b/kernel/fork.c index 062a4d1f9c3e..5880309f3fee 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -103,6 +103,52 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) spin_unlock_irqrestore(&q->lock, flags); } +void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + __set_current_state(state); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + __set_current_state(state); + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync) +{ + int ret = default_wake_function(wait, mode, sync); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} + void __init fork_init(unsigned long mempages) { /* create a slab on which task_structs can be allocated */ diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 557ae8f7ded2..0409fc676f29 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -400,6 +400,10 @@ EXPORT_SYMBOL(irq_stat); EXPORT_SYMBOL(add_wait_queue); EXPORT_SYMBOL(add_wait_queue_exclusive); EXPORT_SYMBOL(remove_wait_queue); +EXPORT_SYMBOL(prepare_to_wait); +EXPORT_SYMBOL(prepare_to_wait_exclusive); +EXPORT_SYMBOL(finish_wait); +EXPORT_SYMBOL(autoremove_wake_function); /* completion handling */ EXPORT_SYMBOL(wait_for_completion); @@ -493,7 +497,9 @@ EXPORT_SYMBOL(jiffies_64); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); EXPORT_SYMBOL(do_settimeofday); - +#ifdef CONFIG_DEBUG_KERNEL +EXPORT_SYMBOL(__might_sleep); +#endif #if !defined(__ia64__) EXPORT_SYMBOL(loops_per_jiffy); #endif diff --git a/kernel/pid.c b/kernel/pid.c index b4da62f0aef2..0005a8cc36cb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -53,6 +53,8 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] = static pidmap_t *map_limit = pidmap_array + PIDMAP_ENTRIES; +static spinlock_t pidmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + inline void free_pidmap(int pid) { pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; @@ -77,8 +79,13 @@ static inline pidmap_t *next_free_map(pidmap_t *map, int *max_steps) * Free the page if someone raced with us * installing it: */ - if (cmpxchg(&map->page, NULL, (void *) page)) + spin_lock(&pidmap_lock); + if (map->page) free_page(page); + else + map->page = (void *)page; + spin_unlock(&pidmap_lock); + if (!map->page) break; } diff --git a/kernel/sched.c b/kernel/sched.c index 
304f90fd4bdf..9965e5f7549e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2150,3 +2150,20 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current, smp_processor_id()); } +#ifdef CONFIG_DEBUG_KERNEL +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if (in_atomic()) { + if (time_before(jiffies, prev_jiffy + HZ)) + return; + prev_jiffy = jiffies; + printk("Sleeping function called from illegal" + " context at %s:%d\n", file, line); + dump_stack(); + } +#endif +} +#endif diff --git a/kernel/timer.c b/kernel/timer.c index 3b4be840f931..55c14c11c901 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -888,20 +888,6 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0) return -EINVAL; - - if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && - current->policy != SCHED_NORMAL) - { - /* - * Short delay requests up to 2 ms will be handled with - * high precision by a busy wait for all real-time processes. - * - * Its important on SMP not to do this holding locks. - */ - udelay((t.tv_nsec + 999) / 1000); - return 0; - } - expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); current->state = TASK_INTERRUPTIBLE; diff --git a/mm/filemap.c b/mm/filemap.c index 9118a5794f27..f45168a04974 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -632,19 +632,15 @@ static inline wait_queue_head_t *page_waitqueue(struct page *page) void wait_on_page_bit(struct page *page, int bit_nr) { wait_queue_head_t *waitqueue = page_waitqueue(page); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DEFINE_WAIT(wait); - add_wait_queue(waitqueue, &wait); do { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!test_bit(bit_nr, &page->flags)) - break; + prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); sync_page(page); - schedule(); + if (test_bit(bit_nr, &page->flags)) + schedule(); } while (test_bit(bit_nr, &page->flags)); - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(waitqueue, &wait); + finish_wait(waitqueue, &wait); } EXPORT_SYMBOL(wait_on_page_bit); @@ -690,38 +686,27 @@ void end_page_writeback(struct page *page) EXPORT_SYMBOL(end_page_writeback); /* - * Get a lock on the page, assuming we need to sleep - * to get it.. + * Get a lock on the page, assuming we need to sleep to get it. + * + * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * random driver's requestfn sets TASK_RUNNING, we could busywait. However + * chances are that on the second loop, the block layer's plug list is empty, + * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ -static void __lock_page(struct page *page) +void __lock_page(struct page *page) { - wait_queue_head_t *waitqueue = page_waitqueue(page); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + wait_queue_head_t *wqh = page_waitqueue(page); + DEFINE_WAIT(wait); - add_wait_queue_exclusive(waitqueue, &wait); - for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (PageLocked(page)) { - sync_page(page); + while (TestSetPageLocked(page)) { + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + sync_page(page); + if (PageLocked(page)) schedule(); - } - if (!TestSetPageLocked(page)) - break; } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(waitqueue, &wait); -} - -/* - * Get an exclusive lock on the page, optimistically - * assuming it's not locked.. 
- */ -void lock_page(struct page *page) -{ - if (TestSetPageLocked(page)) - __lock_page(page); + finish_wait(wqh, &wait); } +EXPORT_SYMBOL(__lock_page); /* * a rather lightweight function, finding and getting a reference to a diff --git a/mm/mprotect.c b/mm/mprotect.c index fc1e3345d38b..be0096238437 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -187,7 +187,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * Try to merge with the previous vma. */ if (mprotect_attempt_merge(vma, *pprev, end, newflags)) - return 0; + goto success; } else { error = split_vma(mm, vma, start, 1); if (error) @@ -209,7 +209,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, vma->vm_flags = newflags; vma->vm_page_prot = newprot; spin_unlock(&mm->page_table_lock); - +success: change_protection(vma, start, end, newprot); return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 435a12dd1574..ab3284a3b78a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -321,6 +321,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, struct page * page; int freed, i; + if (gfp_mask & __GFP_WAIT) + might_sleep(); + KERNEL_STAT_ADD(pgalloc, 1<<order); zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ @@ -479,6 +482,17 @@ unsigned int nr_free_pages(void) return sum; } +unsigned int nr_used_zone_pages(void) +{ + unsigned int pages = 0; + struct zone *zone; + + for_each_zone(zone) + pages += zone->nr_active + zone->nr_inactive; + + return pages; +} + static unsigned int nr_free_zone_pages(int offset) { pg_data_t *pgdat; diff --git a/mm/pdflush.c b/mm/pdflush.c index d5b5841ef0d9..7c31ae0446b4 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -79,9 +79,9 @@ static unsigned long last_empty_jifs; */ struct pdflush_work { struct task_struct *who; /* The thread */ - void (*fn)(unsigned long); /* A callback function for pdflush to work on */ - unsigned long arg0; /* An argument to the callback function */ - struct list_head list; /* On pdflush_list, when the thread is idle */ + void (*fn)(unsigned long); /* A callback function */ + unsigned long arg0; /* An argument to the callback */ + struct list_head list; /* On pdflush_list, when idle */ unsigned long when_i_went_to_sleep; }; @@ -99,24 +99,35 @@ static int __pdflush(struct pdflush_work *my_work) current->flags |= PF_FLUSHER; my_work->fn = NULL; my_work->who = current; + INIT_LIST_HEAD(&my_work->list); spin_lock_irq(&pdflush_lock); nr_pdflush_threads++; -// printk("pdflush %d [%d] starts\n", nr_pdflush_threads, current->pid); for ( ; ; ) { struct pdflush_work *pdf; - list_add(&my_work->list, &pdflush_list); - my_work->when_i_went_to_sleep = jiffies; set_current_state(TASK_INTERRUPTIBLE); + list_move(&my_work->list, &pdflush_list); + my_work->when_i_went_to_sleep = jiffies; spin_unlock_irq(&pdflush_lock); if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); schedule(); - if (my_work->fn) - (*my_work->fn)(my_work->arg0); + spin_lock_irq(&pdflush_lock); + if (!list_empty(&my_work->list)) { + printk("pdflush: bogus wakeup!\n"); + my_work->fn = NULL; + continue; + } + if (my_work->fn == NULL) { + printk("pdflush: NULL work function\n"); + continue; + } + spin_unlock_irq(&pdflush_lock); + + (*my_work->fn)(my_work->arg0); /* * Thread creation: For how long have there been zero @@ -132,6 +143,7 @@ static int __pdflush(struct pdflush_work *my_work) } spin_lock_irq(&pdflush_lock); + my_work->fn = NULL; /* * Thread destruction: For how long has the sleepiest @@ -143,13 +155,12 @@ static int 
__pdflush(struct pdflush_work *my_work) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { - pdf->when_i_went_to_sleep = jiffies; /* Limit exit rate */ + /* Limit exit rate */ + pdf->when_i_went_to_sleep = jiffies; break; /* exeunt */ } - my_work->fn = NULL; } nr_pdflush_threads--; -// printk("pdflush %d [%d] ends\n", nr_pdflush_threads, current->pid); spin_unlock_irq(&pdflush_lock); return 0; } @@ -191,11 +202,10 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) list_del_init(&pdf->list); if (list_empty(&pdflush_list)) last_empty_jifs = jiffies; - spin_unlock_irqrestore(&pdflush_lock, flags); pdf->fn = fn; pdf->arg0 = arg0; - wmb(); /* ? */ wake_up_process(pdf->who); + spin_unlock_irqrestore(&pdflush_lock, flags); } return ret; } diff --git a/mm/slab.c b/mm/slab.c index 549cd2f465ea..a6bd0a98734b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1370,6 +1370,9 @@ static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags) unsigned long save_flags; void* objp; + if (flags & __GFP_WAIT) + might_sleep(); + kmem_cache_alloc_head(cachep, flags); try_again: local_irq_save(save_flags); @@ -1496,7 +1499,11 @@ static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp) if (unlikely(!--slabp->inuse)) { /* Was partial or full, now empty. */ list_del(&slabp->list); - list_add(&slabp->list, &cachep->slabs_free); +/* list_add(&slabp->list, &cachep->slabs_free); */ + if (unlikely(list_empty(&cachep->slabs_partial))) + list_add(&slabp->list, &cachep->slabs_partial); + else + kmem_slab_destroy(cachep, slabp); } else if (unlikely(inuse == cachep->num)) { /* Was full. */ list_del(&slabp->list); @@ -1970,7 +1977,7 @@ static int s_show(struct seq_file *m, void *p) } list_for_each(q,&cachep->slabs_partial) { slabp = list_entry(q, slab_t, list); - if (slabp->inuse == cachep->num || !slabp->inuse) + if (slabp->inuse == cachep->num) BUG(); active_objs += slabp->inuse; active_slabs++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 5eade9423f0d..4302f698a7a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -70,6 +70,10 @@ #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #endif +#ifndef CONFIG_QUOTA +#define shrink_dqcache_memory(ratio, gfp_mask) do { } while (0) +#endif + /* Must be called with page's pte_chain_lock held. 
*/ static inline int page_mapping_inuse(struct page * page) { @@ -97,7 +101,7 @@ static inline int is_page_cache_freeable(struct page *page) static /* inline */ int shrink_list(struct list_head *page_list, int nr_pages, - unsigned int gfp_mask, int *max_scan) + unsigned int gfp_mask, int *max_scan, int *nr_mapped) { struct address_space *mapping; LIST_HEAD(ret_pages); @@ -116,6 +120,10 @@ shrink_list(struct list_head *page_list, int nr_pages, if (TestSetPageLocked(page)) goto keep; + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + (*nr_mapped)++; + BUG_ON(PageActive(page)); may_enter_fs = (gfp_mask & __GFP_FS) || (PageSwapCache(page) && (gfp_mask & __GFP_IO)); @@ -320,7 +328,7 @@ keep: */ static /* inline */ int shrink_cache(int nr_pages, struct zone *zone, - unsigned int gfp_mask, int max_scan) + unsigned int gfp_mask, int max_scan, int *nr_mapped) { LIST_HEAD(page_list); struct pagevec pvec; @@ -371,7 +379,8 @@ shrink_cache(int nr_pages, struct zone *zone, max_scan -= nr_scan; KERNEL_STAT_ADD(pgscan, nr_scan); - nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan); + nr_pages = shrink_list(&page_list, nr_pages, + gfp_mask, &max_scan, nr_mapped); if (nr_pages <= 0 && list_empty(&page_list)) goto done; @@ -522,14 +531,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) static /* inline */ int shrink_zone(struct zone *zone, int max_scan, - unsigned int gfp_mask, int nr_pages) + unsigned int gfp_mask, int nr_pages, int *nr_mapped) { unsigned long ratio; - /* This is bogus for ZONE_HIGHMEM? */ - if (kmem_cache_reap(gfp_mask) >= nr_pages) - return 0; - /* * Try to keep the active list 2/3 of the size of the cache. And * make sure that refill_inactive is given a decent number of pages. @@ -547,7 +552,8 @@ shrink_zone(struct zone *zone, int max_scan, atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); refill_inactive_zone(zone, SWAP_CLUSTER_MAX); } - nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan); + nr_pages = shrink_cache(nr_pages, zone, gfp_mask, + max_scan, nr_mapped); return nr_pages; } @@ -557,6 +563,9 @@ shrink_caches(struct zone *classzone, int priority, { struct zone *first_classzone; struct zone *zone; + int ratio; + int nr_mapped = 0; + int pages = nr_used_zone_pages(); first_classzone = classzone->zone_pgdat->node_zones; for (zone = classzone; zone >= first_classzone; zone--) { @@ -581,16 +590,28 @@ shrink_caches(struct zone *classzone, int priority, max_scan = zone->nr_inactive >> priority; if (max_scan < to_reclaim * 2) max_scan = to_reclaim * 2; - unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim); + unreclaimed = shrink_zone(zone, max_scan, + gfp_mask, to_reclaim, &nr_mapped); nr_pages -= to_reclaim - unreclaimed; *total_scanned += max_scan; } - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(1, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + /* + * Here we assume it costs one seek to replace a lru page and that + * it also takes a seek to recreate a cache object. With this in + * mind we age equal percentages of the lru and ageable caches. + * This should balance the seeks generated by these structures. + * + * NOTE: for now I do this for all zones. If we find this is too + * aggressive on large boxes we may want to exclude ZONE_HIGHMEM + * + * If we're encountering mapped pages on the LRU then increase the + * pressure on slab to avoid swapping. 
+ */ + ratio = (pages / (*total_scanned + nr_mapped + 1)) + 1; + shrink_dcache_memory(ratio, gfp_mask); + shrink_icache_memory(ratio, gfp_mask); + shrink_dqcache_memory(ratio, gfp_mask); return nr_pages; } |
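The new drivers/block/deadline-iosched.c above keeps two sector-sorted lists (one per data direction) plus a read FIFO with a 500 ms expiry, and deadline_next_request() keeps picking reads until writes have been passed over writes_starved times. The stand-alone sketch below models only that dispatch decision; the sched_state struct and choose_direction() helper are invented for illustration and are not kernel code.

/*
 * Minimal user-space model of the dispatch ordering in
 * deadline_next_request().  Illustrative only: these names do not
 * exist in the kernel.
 */
#include <stdio.h>
#include <stdbool.h>

enum dir { DIR_READ, DIR_WRITE };

struct sched_state {
	bool reads_pending;	/* sort_list[READ] non-empty */
	bool writes_pending;	/* sort_list[WRITE] non-empty */
	bool read_expired;	/* front of read_fifo past its deadline */
	int  starved;		/* write dispatches skipped so far */
	int  writes_starved;	/* tunable, default 2 */
};

/* Mirrors the ordering of checks in deadline_next_request(). */
static enum dir choose_direction(struct sched_state *s)
{
	if (s->read_expired || s->reads_pending) {
		/* Reads win, unless writes have starved too long. */
		if (s->writes_pending && s->starved++ >= s->writes_starved) {
			s->starved = 0;
			return DIR_WRITE;
		}
		return DIR_READ;
	}
	s->starved = 0;
	return DIR_WRITE;
}

int main(void)
{
	struct sched_state s = {
		.reads_pending = true,
		.writes_pending = true,
		.read_expired = false,
		.starved = 0,
		.writes_starved = 2,
	};

	/* With a steady stream of reads, every third dispatch is a write. */
	for (int i = 0; i < 6; i++)
		printf("dispatch %d: %s\n", i,
		       choose_direction(&s) == DIR_READ ? "READ" : "WRITE");
	return 0;
}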
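deadline_add_rq_hash() and deadline_find_hash() above index each request by the sector just past its end, so an incoming bio can be back-merged by hashing its start sector. Below is a simplified user-space sketch of that lookup; the fixed-size table, the modulo hash (instead of hash_long()) and the struct names are simplifications made up for the example.

/* Sketch of the deadline back-merge hash: requests keyed by end sector. */
#include <stdio.h>
#include <stdlib.h>

#define HASH_ENTRIES 256

struct req {
	unsigned long sector;		/* start sector */
	unsigned long nr_sectors;	/* length in sectors */
	struct req *hash_next;		/* bucket chain */
};

static struct req *hash[HASH_ENTRIES];

static unsigned int hash_fn(unsigned long sector)
{
	return (sector >> 3) % HASH_ENTRIES;	/* DL_HASH_BLOCK analogue */
}

/* Index a request by the sector just past its end. */
static void add_rq_hash(struct req *rq)
{
	unsigned int h = hash_fn(rq->sector + rq->nr_sectors);

	rq->hash_next = hash[h];
	hash[h] = rq;
}

/* Find a request that ends exactly where a new bio begins. */
static struct req *find_back_merge(unsigned long bio_sector)
{
	struct req *rq;

	for (rq = hash[hash_fn(bio_sector)]; rq; rq = rq->hash_next)
		if (rq->sector + rq->nr_sectors == bio_sector)
			return rq;
	return NULL;
}

int main(void)
{
	struct req a = { .sector = 1000, .nr_sectors = 8 };

	add_rq_hash(&a);
	printf("bio at 1008 merges: %s\n",
	       find_back_merge(1008) ? "yes" : "no");
	printf("bio at 1016 merges: %s\n",
	       find_back_merge(1016) ? "yes" : "no");
	return 0;
}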
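Several call sites in this diff (get_request_wait(), blk_congestion_wait(), __wait_on_buffer(), wait_on_page_bit(), __lock_page()) are converted to the auto-removing wait API added to include/linux/wait.h and kernel/fork.c. A converted wait loop has roughly the shape sketched below; my_waitq and my_event_done() are hypothetical, and the snippet only builds against the patched kernel headers.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);	/* hypothetical waitqueue */

static int my_event_done(void);			/* hypothetical predicate, defined elsewhere */

static void wait_for_my_event(void)
{
	DEFINE_WAIT(wait);

	do {
		/*
		 * Queue ourselves and set the task state *before* testing
		 * the condition, so a wakeup arriving between the test and
		 * schedule() is not lost.
		 */
		prepare_to_wait(&my_waitq, &wait, TASK_UNINTERRUPTIBLE);
		if (!my_event_done())
			schedule();
	} while (!my_event_done());
	finish_wait(&my_waitq, &wait);
}

The waker side just calls wake_up(&my_waitq); autoremove_wake_function() takes the woken entry off the queue, so finish_wait() usually finds an already-empty task_list and only has to restore TASK_RUNNING.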
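The VM changes at the end replace the old priority arguments of shrink_dcache_memory(), shrink_icache_memory() and shrink_dqcache_memory() with a single ratio computed in shrink_caches(): ratio = nr_used_zone_pages() / (total_scanned + nr_mapped + 1) + 1, after which each cache prunes roughly nr_objects / ratio + 1 entries, so slab objects age at about the same rate as LRU pages. A toy calculation with made-up numbers:

/* Arithmetic behind the ratio-based shrinking in shrink_caches(). */
#include <stdio.h>

int main(void)
{
	unsigned int lru_pages = 200000;	/* nr_used_zone_pages() */
	unsigned int total_scanned = 2000;	/* LRU pages scanned this pass */
	unsigned int nr_mapped = 500;		/* mapped/swapcache pages seen */
	unsigned int nr_dentry = 80000;		/* dentry_stat.nr_dentry */
	unsigned int nr_inodes = 60000;		/* inodes_stat.nr_inodes */

	unsigned int ratio = lru_pages / (total_scanned + nr_mapped + 1) + 1;

	printf("ratio = %u\n", ratio);
	printf("prune_dcache(%u)\n", nr_dentry / ratio + 1);
	printf("prune_icache(%u)\n", nr_inodes / ratio + 1);
	return 0;
}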
