From 3abbd8ff39f3da75117a35ac50020818ff3ef7a6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:03 -0700 Subject: [PATCH] bring back the batch_requests function From: Nick Piggin The batch_requests function got lost during the merge of the dynamic request allocation patch. We need it for the anticipatory scheduler - when the number of threads exceeds the number of requests, the anticipated-upon task will undesirably sleep in get_request_wait(). And apparently some block devices which use small requests need it so they string a decent number together. Jens has acked this patch. --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 786ea3563752..621a5b042a9c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,7 @@ struct request_pm_state; struct request_list { int count[2]; mempool_t *rq_pool; + wait_queue_head_t wait[2]; }; /* -- cgit v1.2.3 From 33c664854c9c467f4c30fe038c2afa12cc126311 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:09 -0700 Subject: [PATCH] Create `kblockd' workqueue keventd is inappropriate for running block request queues because keventd itself can get blocked on disk I/O, via call_usermodehelper()'s vfork and, presumably, GFP_KERNEL allocations. So create a new gang of kernel threads whose mandate is for running low-level disk operations. It must never block on disk IO, so any memory allocations should be GFP_NOIO. We mainly use it for running unplug operations from interrupt context. --- drivers/block/Makefile | 5 +++++ drivers/block/ll_rw_blk.c | 24 ++++++++++++++++++++++-- include/linux/blkdev.h | 4 ++++ 3 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/drivers/block/Makefile b/drivers/block/Makefile index c723e8ecc584..67c567bc9308 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -8,6 +8,11 @@ # In the future, some of these should be built conditionally.
# +# +# NOTE that ll_rw_blk.c must come early in linkage order - it starts the +# kblockd threads +# + obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o deadline-iosched.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index dfd489ea0234..3e68ceb9578c 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -48,9 +48,15 @@ static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; */ static int queue_nr_requests; -unsigned long blk_max_low_pfn, blk_max_pfn; static wait_queue_head_t congestion_wqh[2]; +/* + * Controlling structure to kblockd + */ +static struct workqueue_struct *kblockd_workqueue; + +unsigned long blk_max_low_pfn, blk_max_pfn; + static inline int batch_requests(void) { return min(BLKDEV_MAX_RQ / 8, 8); @@ -2308,10 +2314,24 @@ void blk_rq_prep_restart(struct request *rq) rq->current_nr_sectors = rq->hard_cur_sectors; } +int kblockd_schedule_work(struct work_struct *work) +{ + return queue_work(kblockd_workqueue, work); +} + +void kblockd_flush(void) +{ + flush_workqueue(kblockd_workqueue); +} + int __init blk_dev_init(void) { int i; + kblockd_workqueue = create_workqueue("kblockd"); + if (!kblockd_workqueue) + panic("Failed to create kblockd\n"); + request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, 0, NULL, NULL); if (!request_cachep) @@ -2331,7 +2351,7 @@ int __init blk_dev_init(void) for (i = 0; i < ARRAY_SIZE(congestion_wqh); i++) init_waitqueue_head(&congestion_wqh[i]); return 0; -}; +} EXPORT_SYMBOL(process_that_request_first); EXPORT_SYMBOL(end_that_request_first); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 621a5b042a9c..e97790517973 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,6 +561,10 @@ static inline void put_dev_sector(Sector p) page_cache_release(p.v); } +struct work_struct; +int kblockd_schedule_work(struct work_struct *work); +void kblockd_flush(void); + #ifdef CONFIG_LBD # include # define sector_div(a, b) do_div(a, b) -- cgit v1.2.3 From ee66147bf85b90df796737381e057155b4bc4fe9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:44 -0700 Subject: [PATCH] per queue nr_requests From: Nick Piggin This gets rid of the global queue_nr_requests and usage of BLKDEV_MAX_RQ (the latter is now only used to set the queues' defaults). The queue depth becomes per-queue, controlled by a sysfs entry. 
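As a rough, user-space sketch of what the new per-queue knob feeds into (not part of the patch; the arithmetic is copied from the queue_congestion_on/off_threshold() hunks below, and the example sizes are arbitrary), the congestion hysteresis band now scales with whatever is written to nr_requests instead of being fixed by BLKDEV_MAX_RQ:

#include <stdio.h>

/* Mirrors the per-queue congestion thresholds introduced below. */
static int congestion_on_threshold(int nr_requests)
{
	int ret = nr_requests / 8 - 1;

	if (ret < 0)
		ret = 1;
	return ret;
}

static int congestion_off_threshold(int nr_requests)
{
	int ret = nr_requests / 8 + 1;

	if (ret > nr_requests)
		ret = nr_requests;
	return ret;
}

int main(void)
{
	/* BLKDEV_MIN_RQ, the BLKDEV_MAX_RQ default, and a larger sysfs setting */
	int sizes[] = { 4, 128, 1024 };
	int i;

	for (i = 0; i < 3; i++)
		printf("nr_requests=%4d  on threshold=%3d  off threshold=%3d\n",
		       sizes[i], congestion_on_threshold(sizes[i]),
		       congestion_off_threshold(sizes[i]));
	return 0;
}

With the default of 128 requests the thresholds come out as 15 and 17, the same values the removed boot-time printks would have reported; the point of the change is that they now track the per-queue nr_requests setting.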
--- drivers/block/elevator.c | 14 +--- drivers/block/genhd.c | 4 +- drivers/block/ll_rw_blk.c | 205 ++++++++++++++++++++++++++++++++++++++-------- include/linux/blkdev.h | 11 ++- include/linux/elevator.h | 4 +- 5 files changed, 188 insertions(+), 50 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 406755724e03..89af76783943 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -379,17 +379,13 @@ void elv_completed_request(request_queue_t *q, struct request *rq) e->elevator_completed_req_fn(q, rq); } -int elv_register_queue(struct gendisk *disk) +int elv_register_queue(struct request_queue *q) { - request_queue_t *q = disk->queue; elevator_t *e; - if (!q) - return -ENXIO; - e = &q->elevator; - e->kobj.parent = kobject_get(&disk->kobj); + e->kobj.parent = kobject_get(&q->kobj); if (!e->kobj.parent) return -EBUSY; @@ -399,14 +395,12 @@ int elv_register_queue(struct gendisk *disk) return kobject_register(&e->kobj); } -void elv_unregister_queue(struct gendisk *disk) +void elv_unregister_queue(struct request_queue *q) { - request_queue_t *q = disk->queue; - if (q) { elevator_t * e = &q->elevator; kobject_unregister(&e->kobj); - kobject_put(&disk->kobj); + kobject_put(&q->kobj); } } diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c index 889b8753c29f..361aee8ab255 100644 --- a/drivers/block/genhd.c +++ b/drivers/block/genhd.c @@ -191,7 +191,7 @@ void add_disk(struct gendisk *disk) blk_register_region(MKDEV(disk->major, disk->first_minor), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); - elv_register_queue(disk); + blk_register_queue(disk); } EXPORT_SYMBOL(add_disk); @@ -199,7 +199,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { - elv_unregister_queue(disk); + blk_unregister_queue(disk); blk_unregister_region(MKDEV(disk->major, disk->first_minor), disk->minors); } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index cdfe7d3697bc..b1248e542e5e 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -42,12 +42,6 @@ static kmem_cache_t *request_cachep; static LIST_HEAD(blk_plug_list); static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -/* - * Number of requests per queue. This many for reads and for writes (twice - * this number, total). - */ -static int queue_nr_requests; - static wait_queue_head_t congestion_wqh[2]; /* @@ -57,9 +51,9 @@ static struct workqueue_struct *kblockd_workqueue; unsigned long blk_max_low_pfn, blk_max_pfn; -static inline int batch_requests(void) +static inline int batch_requests(struct request_queue *q) { - return min(BLKDEV_MAX_RQ / 8, 8); + return min(q->nr_requests / 8, 8UL); } /* @@ -67,11 +61,11 @@ static inline int batch_requests(void) * considered to be congested. It include a little hysteresis to keep the * context switch rate down. 
*/ -static inline int queue_congestion_on_threshold(void) +static inline int queue_congestion_on_threshold(struct request_queue *q) { int ret; - ret = queue_nr_requests / 8 - 1; + ret = q->nr_requests / 8 - 1; if (ret < 0) ret = 1; return ret; @@ -80,13 +74,13 @@ static inline int queue_congestion_on_threshold(void) /* * The threshold at which a queue is considered to be uncongested */ -static inline int queue_congestion_off_threshold(void) +static inline int queue_congestion_off_threshold(struct request_queue *q) { int ret; - ret = queue_nr_requests / 8 + 1; - if (ret > queue_nr_requests) - ret = queue_nr_requests; + ret = q->nr_requests / 8 + 1; + if (ret > q->nr_requests) + ret = q->nr_requests; return ret; } @@ -199,6 +193,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) /* * set defaults */ + q->nr_requests = BLKDEV_MAX_RQ; q->max_phys_segments = MAX_PHYS_SEGMENTS; q->max_hw_segments = MAX_HW_SEGMENTS; q->make_request_fn = mfn; @@ -452,13 +447,15 @@ void blk_queue_free_tags(request_queue_t *q) q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); } -static int init_tag_map(struct blk_queue_tag *tags, int depth) +static int +init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) { int bits, i; - if (depth > (queue_nr_requests*2)) { - depth = (queue_nr_requests*2); - printk(KERN_ERR "%s: adjusted depth to %d\n", __FUNCTION__, depth); + if (depth > q->nr_requests * 2) { + depth = q->nr_requests * 2; + printk(KERN_ERR "%s: adjusted depth to %d\n", + __FUNCTION__, depth); } tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); @@ -487,7 +484,6 @@ fail: return -ENOMEM; } - /** * blk_queue_init_tags - initialize the queue tag info * @q: the request queue for the device @@ -501,7 +497,7 @@ int blk_queue_init_tags(request_queue_t *q, int depth) if (!tags) goto fail; - if (init_tag_map(tags, depth)) + if (init_tag_map(q, tags, depth)) goto fail; INIT_LIST_HEAD(&tags->busy_list); @@ -551,7 +547,7 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth) tag_map = bqt->tag_map; max_depth = bqt->real_max_depth; - if (init_tag_map(bqt, new_depth)) + if (init_tag_map(q, bqt, new_depth)) return -ENOMEM; memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); @@ -1315,12 +1311,12 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) struct request_list *rl = &q->rq; spin_lock_irq(q->queue_lock); - if (rl->count[rw] == BLKDEV_MAX_RQ || !elv_may_queue(q, rw)) { + if (rl->count[rw] >= q->nr_requests || !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; } rl->count[rw]++; - if ((BLKDEV_MAX_RQ - rl->count[rw]) < queue_congestion_on_threshold()) + if ((q->nr_requests - rl->count[rw]) < queue_congestion_on_threshold(q)) set_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); @@ -1328,7 +1324,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) if (!rq) { spin_lock_irq(q->queue_lock); rl->count[rw]--; - if ((BLKDEV_MAX_RQ - rl->count[rw]) >= queue_congestion_off_threshold()) + if ((q->nr_requests - rl->count[rw]) >= queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); goto out; @@ -1549,10 +1545,10 @@ void __blk_put_request(request_queue_t *q, struct request *req) blk_free_request(q, req); rl->count[rw]--; - if ((BLKDEV_MAX_RQ - rl->count[rw]) >= - queue_congestion_off_threshold()) + if ((q->nr_requests - rl->count[rw]) >= + queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); - if ((BLKDEV_MAX_RQ - 
rl->count[rw]) >= batch_requests() && + if ((q->nr_requests - rl->count[rw]) >= batch_requests(q) && waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); } @@ -2360,14 +2356,6 @@ int __init blk_dev_init(void) if (!request_cachep) panic("Can't create request pool slab cache\n"); - queue_nr_requests = BLKDEV_MAX_RQ; - - printk("block request queues:\n"); - printk(" %d/%d requests per read queue\n", BLKDEV_MIN_RQ, queue_nr_requests); - printk(" %d/%d requests per write queue\n", BLKDEV_MIN_RQ, queue_nr_requests); - printk(" enter congestion at %d\n", queue_congestion_on_threshold()); - printk(" exit congestion at %d\n", queue_congestion_off_threshold()); - blk_max_low_pfn = max_low_pfn; blk_max_pfn = max_pfn; @@ -2376,6 +2364,153 @@ int __init blk_dev_init(void) return 0; } +/* + * sysfs parts below + */ +struct queue_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct request_queue *, char *); + ssize_t (*store)(struct request_queue *, const char *, size_t); +}; + +static ssize_t +queue_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t +queue_var_store(unsigned long *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtoul(p, &p, 10); + return count; +} + +static ssize_t queue_requests_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->nr_requests, (page)); +} + +static ssize_t +queue_requests_store(struct request_queue *q, const char *page, size_t count) +{ + struct request_list *rl = &q->rq; + + int ret = queue_var_store(&q->nr_requests, page, count); + if (q->nr_requests < BLKDEV_MIN_RQ) + q->nr_requests = BLKDEV_MIN_RQ; + + if ((q->nr_requests - rl->count[READ]) < + queue_congestion_on_threshold(q)) + set_queue_congested(q, READ); + else if ((q->nr_requests - rl->count[READ]) >= + queue_congestion_off_threshold(q)) + clear_queue_congested(q, READ); + + if ((q->nr_requests - rl->count[READ]) < + queue_congestion_on_threshold(q)) + set_queue_congested(q, READ); + else if ((q->nr_requests - rl->count[READ]) >= + queue_congestion_off_threshold(q)) + clear_queue_congested(q, READ); + + return ret; +} + +static struct queue_sysfs_entry queue_requests_entry = { + .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, + .show = queue_requests_show, + .store = queue_requests_store, +}; + +static struct attribute *default_attrs[] = { + &queue_requests_entry.attr, + NULL, +}; + +#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) + +static ssize_t +queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct queue_sysfs_entry *entry = to_queue(attr); + struct request_queue *q; + + q = container_of(kobj, struct request_queue, kobj); + if (!entry->show) + return 0; + + return entry->show(q, page); +} + +static ssize_t +queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct queue_sysfs_entry *entry = to_queue(attr); + struct request_queue *q; + + q = container_of(kobj, struct request_queue, kobj); + if (!entry->store) + return -EINVAL; + + return entry->store(q, page, length); +} + +static struct sysfs_ops queue_sysfs_ops = { + .show = queue_attr_show, + .store = queue_attr_store, +}; + +struct kobj_type queue_ktype = { + .sysfs_ops = &queue_sysfs_ops, + .default_attrs = default_attrs, +}; + +int blk_register_queue(struct gendisk *disk) +{ + int ret; + + request_queue_t *q = disk->queue; + + if (!q) + return -ENXIO; + + q->kobj.parent = kobject_get(&disk->kobj); + if 
(!q->kobj.parent) return -EBUSY; + + snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); + q->kobj.ktype = &queue_ktype; + + ret = kobject_register(&q->kobj); + if (ret < 0) + return ret; + + ret = elv_register_queue(q); + if (ret) { + kobject_unregister(&q->kobj); + return ret; + } + + return 0; +} + +void blk_unregister_queue(struct gendisk *disk) +{ + request_queue_t *q = disk->queue; + + if (q) { + elv_unregister_queue(q); + + kobject_unregister(&q->kobj); + kobject_put(&disk->kobj); + } +} + + EXPORT_SYMBOL(process_that_request_first); EXPORT_SYMBOL(end_that_request_first); EXPORT_SYMBOL(end_that_request_chunk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e97790517973..4295d60bf661 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,7 +22,7 @@ typedef struct elevator_s elevator_t; struct request_pm_state; #define BLKDEV_MIN_RQ 4 -#define BLKDEV_MAX_RQ 128 +#define BLKDEV_MAX_RQ 128 /* Default maximum */ struct request_list { int count[2]; @@ -268,9 +268,16 @@ struct request_queue */ spinlock_t *queue_lock; + /* + * queue kobject + */ + struct kobject kobj; + /* * queue settings */ + unsigned long nr_requests; /* Max # of requests */ + unsigned short max_sectors; unsigned short max_phys_segments; unsigned short max_hw_segments; @@ -398,6 +405,8 @@ struct sec_size { unsigned block_size_bits; }; +extern int blk_register_queue(struct gendisk *disk); +extern void blk_unregister_queue(struct gendisk *disk); extern void register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void blk_put_request(struct request *); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index d793bb97dd54..b0e70562be94 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -68,8 +68,8 @@ extern int elv_queue_empty(request_queue_t *); extern struct request *elv_next_request(struct request_queue *q); extern struct request *elv_former_request(request_queue_t *, struct request *); extern struct request *elv_latter_request(request_queue_t *, struct request *); -extern int elv_register_queue(struct gendisk *); -extern void elv_unregister_queue(struct gendisk *); +extern int elv_register_queue(request_queue_t *q); +extern void elv_unregister_queue(request_queue_t *q); extern int elv_may_queue(request_queue_t *, int); extern void elv_completed_request(request_queue_t *, struct request *); extern int elv_set_request(request_queue_t *, struct request *, int); -- cgit v1.2.3 From 80af89ca709d4dfe41178abe29217a0fefa1af12 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:12 -0700 Subject: [PATCH] block batching fairness From: Nick Piggin This patch fixes the request batching fairness/starvation issue. It's not clear what is going on with 2.4, but it seems that it's a problem around this area. Anyway, previously: * request queue fills up * process 1 calls get_request, sleeps * a couple of requests are freed * process 2 calls get_request, proceeds * a couple of requests are freed * process 2 calls get_request... Now as unlikely as it seems, it could be a problem. It's a fairness problem that process 2 can skip ahead of process 1 anyway. With the patch: * request queue fills up * any process calling get_request will sleep * once the queue gets below the batch watermark, processes start being woken, and may allocate. This patch includes Chris Mason's fix to only clear queue_full when all tasks have been woken. Previously I think starvation and unfairness could still occur.
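Restated as a toy, single-threaded user-space model (a sketch only: the task ids, the tiny queue size and the plain arrays are made up, and elv_may_queue(), the READ/WRITE split and all locking are dropped; the control flow mirrors the get_request()/__blk_put_request() hunks below), a task that finds the queue full parks on the exclusive waitqueue, each freed request wakes exactly one parked task once the count falls to the batch mark, and queue_full is only cleared when nobody is left waiting:

#include <stdio.h>

#define NR_REQUESTS	4			/* tiny queue so the example stays short */
#define BATCH		(NR_REQUESTS - 1)	/* what batch_requests() returns for this size */

static int count, queue_full;
static int waiters[8], nwait;	/* FIFO of blocked task ids */

/* get_request(): fails (the task must sleep) unless it was just woken ("force") */
static int get_request(int task, int force)
{
	if (count == NR_REQUESTS)
		queue_full = 1;
	if (queue_full && !force) {
		waiters[nwait++] = task;	/* prepare_to_wait_exclusive() */
		printf("task %d sleeps\n", task);
		return 0;
	}
	count++;
	printf("task %d got a request (count=%d)\n", task, count);
	return 1;
}

/* __blk_put_request(): wake one waiter, or clear queue_full when none remain */
static void put_request(void)
{
	count--;
	if (count <= BATCH) {
		if (nwait) {
			int task = waiters[0];
			int i;

			for (i = 1; i < nwait; i++)	/* pop the head of the FIFO */
				waiters[i - 1] = waiters[i];
			nwait--;
			printf("task %d woken\n", task);
			get_request(task, 1);		/* the woken task retries and wins */
		} else {
			queue_full = 0;
		}
	}
}

int main(void)
{
	int i;

	for (i = 0; i < NR_REQUESTS; i++)	/* fill the queue */
		get_request(0, 0);
	get_request(1, 0);	/* task 1 blocks */
	get_request(2, 0);	/* task 2 blocks behind it, it cannot jump the queue */
	put_request();		/* freed requests wake the sleepers in FIFO order */
	put_request();
	return 0;
}

Replaying the changelog's scenario this way, the second task always lands behind the first on the waitqueue, so it can no longer skip ahead of a sleeper.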
With this change to the blk-fair-batches patch, Chris is showing some much improved numbers for 2.4 - 170 ms max wait vs 2700ms without blk-fair-batches for a dbench 90 run. He didn't indicate how much difference his patch alone made, but it is an important fix I think. --- drivers/block/ll_rw_blk.c | 75 +++++++++++++++++++++++++++++++---------------- include/linux/blkdev.h | 26 ++++++++++++++++ 2 files changed, 75 insertions(+), 26 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 57daaf4aea9d..f7981c1fa3e6 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -53,7 +53,7 @@ unsigned long blk_max_low_pfn, blk_max_pfn; static inline int batch_requests(struct request_queue *q) { - return q->nr_requests - min(q->nr_requests / 8, 8UL); + return q->nr_requests - min(q->nr_requests / 8, 8UL) - 1; } /* @@ -1309,13 +1309,16 @@ static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) /* * Get a free request, queue_lock must not be held */ -static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) +static struct request * +get_request(request_queue_t *q, int rw, int gfp_mask, int force) { struct request *rq = NULL; struct request_list *rl = &q->rq; spin_lock_irq(q->queue_lock); - if (rl->count[rw] >= q->nr_requests && !elv_may_queue(q, rw)) { + if (rl->count[rw] == q->nr_requests) + blk_set_queue_full(q, rw); + if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; } @@ -1330,6 +1333,14 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) rl->count[rw]--; if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); + + if (rl->count[rw] <= batch_requests(q)) { + if (waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); + else + blk_clear_queue_full(q, rw); + } + spin_unlock_irq(q->queue_lock); goto out; } @@ -1366,26 +1377,22 @@ static struct request *get_request_wait(request_queue_t *q, int rw) { DEFINE_WAIT(wait); struct request *rq; + int waited = 0; generic_unplug_device(q); do { - rq = get_request(q, rw, GFP_NOIO); + struct request_list *rl = &q->rq; - if (!rq) { - struct request_list *rl = &q->rq; + prepare_to_wait_exclusive(&rl->wait[rw], &wait, + TASK_UNINTERRUPTIBLE); - prepare_to_wait_exclusive(&rl->wait[rw], &wait, - TASK_UNINTERRUPTIBLE); - /* - * If _all_ the requests were suddenly returned then - * no wakeup will be delivered. So now we're on the - * waitqueue, go check for that. 
- */ - rq = get_request(q, rw, GFP_NOIO); - if (!rq) - io_schedule(); - finish_wait(&rl->wait[rw], &wait); + rq = get_request(q, rw, GFP_NOIO, waited); + + if (!rq) { + io_schedule(); + waited = 1; } + finish_wait(&rl->wait[rw], &wait); } while (!rq); return rq; @@ -1397,10 +1404,10 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) BUG_ON(rw != READ && rw != WRITE); - rq = get_request(q, rw, gfp_mask); - - if (!rq && (gfp_mask & __GFP_WAIT)) + if (gfp_mask & __GFP_WAIT) rq = get_request_wait(q, rw); + else + rq = get_request(q, rw, gfp_mask, 0); return rq; } @@ -1551,9 +1558,13 @@ void __blk_put_request(request_queue_t *q, struct request *req) rl->count[rw]--; if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); - if (rl->count[rw] < batch_requests(q) && - waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); + + if (rl->count[rw] <= batch_requests(q)) { + if (waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); + else + blk_clear_queue_full(q, rw); + } } } @@ -1796,7 +1807,7 @@ get_rq: freereq = NULL; } else { spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) { + if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) { /* * READA bit set */ @@ -1904,8 +1915,7 @@ static inline void blk_partition_remap(struct bio *bio) * bio happens to be merged with someone else, and may change bi_dev and * bi_sector for remaps as it sees fit. So the values of these fields * should NOT be depended on after the call to generic_make_request. - * - * */ + */ void generic_make_request(struct bio *bio) { request_queue_t *q; @@ -2415,6 +2425,19 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) clear_queue_congested(q, WRITE); + if (rl->count[READ] >= q->nr_requests) { + blk_set_queue_full(q, READ); + } else if (rl->count[READ] <= batch_requests(q)) { + blk_clear_queue_full(q, READ); + wake_up_all(&rl->wait[READ]); + } + + if (rl->count[WRITE] >= q->nr_requests) { + blk_set_queue_full(q, WRITE); + } else if (rl->count[WRITE] <= batch_requests(q)) { + blk_clear_queue_full(q, WRITE); + wake_up_all(&rl->wait[WRITE]); + } return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4295d60bf661..d3a8f6ecd806 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -307,6 +307,8 @@ struct request_queue #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ +#define QUEUE_FLAG_READFULL 3 /* write queue has been filled */ +#define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */ #define blk_queue_plugged(q) !list_empty(&(q)->plug_list) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -322,6 +324,30 @@ struct request_queue #define rq_data_dir(rq) ((rq)->flags & 1) +static inline int blk_queue_full(struct request_queue *q, int rw) +{ + if (rw == READ) + return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags); + return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); +} + +static inline void blk_set_queue_full(struct request_queue *q, int rw) +{ + if (rw == READ) + set_bit(QUEUE_FLAG_READFULL, &q->queue_flags); + else + set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); +} + +static inline void blk_clear_queue_full(struct request_queue *q, int rw) +{ + if (rw == READ) + clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags); + else + 
clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); +} + + /* * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may * it already be started by driver. -- cgit v1.2.3 From 16f88dbdbffa3dc52b959706e6a311a932b51ed6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:19 -0700 Subject: [PATCH] generic io contexts From: Nick Piggin Generalise the AS-specific per-process IO context so that other IO schedulers could use it. --- drivers/block/as-iosched.c | 254 +++++++++++++++++---------------------------- drivers/block/ll_rw_blk.c | 88 ++++++++++++++++ include/linux/blkdev.h | 44 ++++++++ include/linux/sched.h | 6 +- kernel/exit.c | 4 +- kernel/fork.c | 2 +- 6 files changed, 233 insertions(+), 165 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 2e5e64fb3b39..d63c92dfcf96 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -59,14 +59,6 @@ */ #define default_antic_expire ((HZ / 150) ? HZ / 150 : 1) -/* - * This is the per-process anticipatory I/O scheduler state. It is refcounted - * and kmalloc'ed. - * - * There is no locking protecting the contents of this structure! Pointers - * to a single as_io_context may appear in multiple queues at once. - */ - /* * Keep track of up to 20ms thinktimes. We can go as big as we like here, * however huge values tend to interfere and not decay fast enough. A program @@ -82,28 +74,6 @@ enum as_io_states { AS_TASK_IORUNNING, /* Process has completed some IO */ }; -struct as_io_context { - atomic_t refcount; - pid_t pid; - unsigned long state; - atomic_t nr_queued; /* queued reads & sync writes */ - atomic_t nr_dispatched; /* number of requests gone to the drivers */ - - spinlock_t lock; - - /* IO History tracking */ - /* Thinktime */ - unsigned long last_end_request; - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - /* Layout pattern */ - long seek_samples; - sector_t last_request_pos; - sector_t seek_total; - sector_t seek_mean; -}; - enum anticipation_status { ANTIC_OFF=0, /* Not anticipating (normal operation) */ ANTIC_WAIT_REQ, /* The last read has not yet completed */ @@ -144,8 +114,8 @@ struct as_data { unsigned long antic_start; /* jiffies: when it started */ struct timer_list antic_timer; /* anticipatory scheduling timer */ struct work_struct antic_work; /* Deferred unplugging */ - struct as_io_context *as_io_context;/* Identify the expected process */ - int aic_finished; /* IO associated with as_io_context finished */ + struct io_context *io_context; /* Identify the expected process */ + int ioc_finished; /* IO associated with io_context is finished */ int nr_dispatched; /* @@ -178,7 +148,7 @@ struct as_rq { struct request *request; - struct as_io_context *as_io_context; /* The submitting task */ + struct io_context *io_context; /* The submitting task */ /* * request hash, key is the ending offset (for back merge lookup) @@ -206,99 +176,55 @@ static kmem_cache_t *arq_pool; /* Debug */ static atomic_t nr_as_io_requests = ATOMIC_INIT(0); -static void put_as_io_context(struct as_io_context **paic) +/* Called to deallocate the as_io_context */ +static void free_as_io_context(struct as_io_context *aic) { - struct as_io_context *aic = *paic; - - if (aic == NULL) - return; - - BUG_ON(atomic_read(&aic->refcount) == 0); - - if (atomic_dec_and_test(&aic->refcount)) { - WARN_ON(atomic_read(&nr_as_io_requests) == 0); - atomic_dec(&nr_as_io_requests); - kfree(aic); - } + 
atomic_dec(&nr_as_io_requests); + kfree(aic); } -/* Called by the exitting task */ -void exit_as_io_context(void) +/* Called when the task exits */ +static void exit_as_io_context(struct as_io_context *aic) { - unsigned long flags; - struct as_io_context *aic; - - local_irq_save(flags); - aic = current->as_io_context; - if (aic) { - clear_bit(AS_TASK_RUNNING, &aic->state); - put_as_io_context(&aic); - current->as_io_context = NULL; - } - local_irq_restore(flags); + clear_bit(AS_TASK_RUNNING, &aic->state); } -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. - * - * This is always called in the context of the task which submitted the I/O. - * But weird things happen, so we disable local interrupts to ensure exclusive - * access to *current. - */ -static struct as_io_context *get_as_io_context(void) +static struct as_io_context *alloc_as_io_context(void) { - struct task_struct *tsk = current; - unsigned long flags; struct as_io_context *ret; - local_irq_save(flags); - ret = tsk->as_io_context; - if (ret == NULL) { - ret = kmalloc(sizeof(*ret), GFP_ATOMIC); - if (ret) { - atomic_inc(&nr_as_io_requests); - atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; - ret->state = 1 << AS_TASK_RUNNING; - atomic_set(&ret->nr_queued, 0); - atomic_set(&ret->nr_dispatched, 0); - spin_lock_init(&ret->lock); - ret->ttime_total = 0; - ret->ttime_samples = 0; - ret->ttime_mean = 0; - ret->seek_total = 0; - ret->seek_samples = 0; - ret->seek_mean = 0; - tsk->as_io_context = ret; - } + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (ret) { + atomic_inc(&nr_as_io_requests); + ret->dtor = free_as_io_context; + ret->exit = exit_as_io_context; + ret->state = 1 << AS_TASK_RUNNING; + atomic_set(&ret->nr_queued, 0); + atomic_set(&ret->nr_dispatched, 0); + spin_lock_init(&ret->lock); + ret->ttime_total = 0; + ret->ttime_samples = 0; + ret->ttime_mean = 0; + ret->seek_total = 0; + ret->seek_samples = 0; + ret->seek_mean = 0; } - local_irq_restore(flags); - atomic_inc(&ret->refcount); + return ret; } -static void -copy_as_io_context(struct as_io_context **pdst, struct as_io_context **psrc) +/* + * If the current task has no AS IO context then create one and initialise it. + * Then take a ref on the task's io context and return it. 
+ */ +static struct io_context *as_get_io_context(void) { - struct as_io_context *src = *psrc; - - if (src) { - BUG_ON(atomic_read(&src->refcount) == 0); - atomic_inc(&src->refcount); - put_as_io_context(pdst); - *pdst = src; - } + struct io_context *ioc = get_io_context(); + if (ioc && !ioc->aic) + ioc->aic = alloc_as_io_context(); + return ioc; } -static void -swap_as_io_context(struct as_io_context **aic1, struct as_io_context **aic2) -{ - struct as_io_context *temp; - temp = *aic1; - *aic1 = *aic2; - *aic2 = temp; -} /* * the back merge hash support functions @@ -662,7 +588,7 @@ static void as_antic_waitreq(struct as_data *ad) { BUG_ON(ad->antic_status == ANTIC_FINISHED); if (ad->antic_status == ANTIC_OFF) { - if (!ad->as_io_context || ad->aic_finished) + if (!ad->io_context || ad->ioc_finished) as_antic_waitnext(ad); else ad->antic_status = ANTIC_WAIT_REQ; @@ -715,7 +641,7 @@ static int as_close_req(struct as_data *ad, struct as_rq *arq) sector_t next = arq->request->sector; sector_t delta; /* acceptable close offset (in sectors) */ - if (ad->antic_status == ANTIC_OFF || !ad->aic_finished) + if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished) delay = 0; else delay = ((jiffies - ad->antic_start) * 1000) / HZ; @@ -745,6 +671,7 @@ static int as_close_req(struct as_data *ad, struct as_rq *arq) */ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) { + struct io_context *ioc; struct as_io_context *aic; if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) { @@ -752,7 +679,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) return 1; } - if (ad->aic_finished && as_antic_expired(ad)) { + if (ad->ioc_finished && as_antic_expired(ad)) { /* * In this situation status should really be FINISHED, * however the timer hasn't had the chance to run yet. 
@@ -760,14 +687,18 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) return 1; } - aic = ad->as_io_context; - BUG_ON(!aic); + ioc = ad->io_context; + BUG_ON(!ioc); - if (arq && aic == arq->as_io_context) { + if (arq && ioc == arq->io_context) { /* request from same process */ return 1; } + aic = ioc->aic; + if (!aic) + return 0; + if (!test_bit(AS_TASK_RUNNING, &aic->state)) { /* process anticipated on has exitted */ return 1; @@ -810,7 +741,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) */ static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) { - if (!ad->as_io_context) + if (!ad->io_context) /* * Last request submitted was a write */ @@ -973,12 +904,10 @@ static void as_completed_request(request_queue_t *q, struct request *rq) { struct as_data *ad = q->elevator.elevator_data; struct as_rq *arq = RQ_DATA(rq); - struct as_io_context *aic = arq->as_io_context; + struct as_io_context *aic; - if (unlikely(!blk_fs_request(rq))) { - WARN_ON(aic); + if (unlikely(!blk_fs_request(rq))) return; - } WARN_ON(blk_fs_request(rq) && arq->state == AS_RQ_NEW); @@ -1004,18 +933,12 @@ static void as_completed_request(request_queue_t *q, struct request *rq) ad->changed_batch = 0; } - if (!aic) + if (!arq->io_context) return; - spin_lock(&aic->lock); - if (arq->is_sync == REQ_SYNC) { - set_bit(AS_TASK_IORUNNING, &aic->state); - aic->last_end_request = jiffies; - } - - if (ad->as_io_context == aic) { + if (ad->io_context == arq->io_context) { ad->antic_start = jiffies; - ad->aic_finished = 1; + ad->ioc_finished = 1; if (ad->antic_status == ANTIC_WAIT_REQ) { /* * We were waiting on this request, now anticipate @@ -1024,9 +947,19 @@ static void as_completed_request(request_queue_t *q, struct request *rq) as_antic_waitnext(ad); } } + + aic = arq->io_context->aic; + if (!aic) + return; + + spin_lock(&aic->lock); + if (arq->is_sync == REQ_SYNC) { + set_bit(AS_TASK_IORUNNING, &aic->state); + aic->last_end_request = jiffies; + } spin_unlock(&aic->lock); - put_as_io_context(&arq->as_io_context); + put_io_context(arq->io_context); } /* @@ -1047,9 +980,9 @@ static void as_remove_queued_request(request_queue_t *q, struct request *rq) WARN_ON(arq->state != AS_RQ_QUEUED); - if (arq->as_io_context) { - BUG_ON(!atomic_read(&arq->as_io_context->nr_queued)); - atomic_dec(&arq->as_io_context->nr_queued); + if (arq->io_context && arq->io_context->aic) { + BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); + atomic_dec(&arq->io_context->aic->nr_queued); } /* @@ -1082,10 +1015,12 @@ static void as_remove_dispatched_request(request_queue_t *q, struct request *rq) WARN_ON(arq->state != AS_RQ_DISPATCHED); WARN_ON(ON_RB(&arq->rb_node)); - aic = arq->as_io_context; - if (aic) { - WARN_ON(!atomic_read(&aic->nr_dispatched)); - atomic_dec(&aic->nr_dispatched); + if (arq->io_context && arq->io_context->aic) { + aic = arq->io_context->aic; + if (aic) { + WARN_ON(!atomic_read(&aic->nr_dispatched)); + atomic_dec(&aic->nr_dispatched); + } } } /* @@ -1180,17 +1115,17 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) if (data_dir == REQ_SYNC) { /* In case we have to anticipate after this */ - copy_as_io_context(&ad->as_io_context, &arq->as_io_context); + copy_io_context(&ad->io_context, &arq->io_context); } else { - if (ad->as_io_context) { - put_as_io_context(&ad->as_io_context); - ad->as_io_context = NULL; + if (ad->io_context) { + put_io_context(ad->io_context); + ad->io_context = NULL; } if (ad->current_write_count != 0) 
ad->current_write_count--; } - ad->aic_finished = 0; + ad->ioc_finished = 0; ad->next_arq[data_dir] = as_find_next_arq(ad, arq); @@ -1199,8 +1134,8 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) */ as_remove_queued_request(ad->q, arq->request); list_add_tail(&arq->request->queuelist, ad->dispatch); - if (arq->as_io_context) - atomic_inc(&arq->as_io_context->nr_dispatched); + if (arq->io_context && arq->io_context->aic) + atomic_inc(&arq->io_context->aic->nr_dispatched); WARN_ON(arq->state != AS_RQ_QUEUED); arq->state = AS_RQ_DISPATCHED; @@ -1355,11 +1290,11 @@ static void as_add_request(struct as_data *ad, struct as_rq *arq) arq->is_sync = 0; data_dir = arq->is_sync; - arq->as_io_context = get_as_io_context(); + arq->io_context = as_get_io_context(); - if (arq->as_io_context) { - atomic_inc(&arq->as_io_context->nr_queued); - as_update_iohist(arq->as_io_context, arq->request); + if (arq->io_context && arq->io_context->aic) { + atomic_inc(&arq->io_context->aic->nr_queued); + as_update_iohist(arq->io_context->aic, arq->request); } as_add_arq_rb(ad, arq); @@ -1575,8 +1510,7 @@ as_merged_requests(request_queue_t *q, struct request *req, * Don't copy here but swap, because when anext is * removed below, it must contain the unused context */ - swap_as_io_context(&arq->as_io_context, - &anext->as_io_context); + swap_io_context(&arq->io_context, &anext->io_context); } } @@ -1584,7 +1518,7 @@ as_merged_requests(request_queue_t *q, struct request *req, * kill knowledge of next, this one is a goner */ as_remove_queued_request(q, next); - put_as_io_context(&anext->as_io_context); + put_io_context(anext->io_context); } /* @@ -1630,7 +1564,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) RB_CLEAR(&arq->rb_node); arq->request = rq; arq->state = AS_RQ_NEW; - arq->as_io_context = NULL; + arq->io_context = NULL; INIT_LIST_HEAD(&arq->hash); arq->hash_valid_count = 0; INIT_LIST_HEAD(&arq->fifo); @@ -1643,16 +1577,18 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) static int as_may_queue(request_queue_t *q, int rw) { + int ret = 0; struct as_data *ad = q->elevator.elevator_data; - struct as_io_context *aic; + struct io_context *ioc; if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { - aic = get_as_io_context(); - if (ad->as_io_context == aic) - return 1; + ioc = as_get_io_context(); + if (ad->io_context == ioc) + ret = 1; + put_io_context(ioc); } - return 0; + return ret; } static void as_exit(request_queue_t *q, elevator_t *e) @@ -1666,7 +1602,7 @@ static void as_exit(request_queue_t *q, elevator_t *e) BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); mempool_destroy(ad->arq_pool); - put_as_io_context(&ad->as_io_context); + put_io_context(ad->io_context); kfree(ad->hash); kfree(ad); } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index f7981c1fa3e6..8f44b5690d9a 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1318,6 +1318,7 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force) spin_lock_irq(q->queue_lock); if (rl->count[rw] == q->nr_requests) blk_set_queue_full(q, rw); + if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; @@ -2377,6 +2378,93 @@ int __init blk_dev_init(void) return 0; } + +/* + * IO Context helper functions + */ +void put_io_context(struct io_context *ioc) +{ + if (ioc == NULL) + return; + + BUG_ON(atomic_read(&ioc->refcount) == 0); + + if 
(atomic_dec_and_test(&ioc->refcount)) { + if (ioc->aic && ioc->aic->dtor) + ioc->aic->dtor(ioc->aic); + kfree(ioc); + } +} + +/* Called by the exitting task */ +void exit_io_context(void) +{ + unsigned long flags; + struct io_context *ioc; + + local_irq_save(flags); + ioc = current->io_context; + if (ioc) { + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + put_io_context(ioc); + current->io_context = NULL; + } + local_irq_restore(flags); +} + +/* + * If the current task has no IO context then create one and initialise it. + * If it does have a context, take a ref on it. + * + * This is always called in the context of the task which submitted the I/O. + * But weird things happen, so we disable local interrupts to ensure exclusive + * access to *current. + */ +struct io_context *get_io_context(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + struct io_context *ret; + + local_irq_save(flags); + ret = tsk->io_context; + if (ret == NULL) { + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->aic = NULL; + tsk->io_context = ret; + } + } + local_irq_restore(flags); + atomic_inc(&ret->refcount); + return ret; +} + +void copy_io_context(struct io_context **pdst, struct io_context **psrc) +{ + struct io_context *src = *psrc; + struct io_context *dst = *pdst; + + if (src) { + BUG_ON(atomic_read(&src->refcount) == 0); + atomic_inc(&src->refcount); + put_io_context(dst); + *pdst = src; + } +} + +void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) +{ + struct io_context *temp; + temp = *ioc1; + *ioc1 = *ioc2; + *ioc2 = temp; +} + + /* * sysfs parts below */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3a8f6ecd806..13116a7a7969 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,50 @@ struct request_pm_state; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ +/* + * This is the per-process anticipatory I/O scheduler state. + */ +struct as_io_context { + spinlock_t lock; + + void (*dtor)(struct as_io_context *aic); /* destructor */ + void (*exit)(struct as_io_context *aic); /* called on task exit */ + + unsigned long state; + atomic_t nr_queued; /* queued reads & sync writes */ + atomic_t nr_dispatched; /* number of requests gone to the drivers */ + + /* IO History tracking */ + /* Thinktime */ + unsigned long last_end_request; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + /* Layout pattern */ + long seek_samples; + sector_t last_request_pos; + sector_t seek_total; + sector_t seek_mean; +}; + +/* + * This is the per-process I/O subsystem state. It is refcounted and + * kmalloc'ed. Currently all fields are modified in process io context + * (apart from the atomic refcount), so require no locking. 
+ */ +struct io_context { + atomic_t refcount; + pid_t pid; + + struct as_io_context *aic; +}; + +void put_io_context(struct io_context *ioc); +void exit_io_context(void); +struct io_context *get_io_context(void); +void copy_io_context(struct io_context **pdst, struct io_context **psrc); +void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); + struct request_list { int count[2]; mempool_t *rq_pool; diff --git a/include/linux/sched.h b/include/linux/sched.h index e29f9606c2aa..750f2a12cada 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -321,8 +321,8 @@ struct k_itimer { }; -struct as_io_context; /* Anticipatory scheduler */ -void exit_as_io_context(void); +struct io_context; /* See blkdev.h */ +void exit_io_context(void); struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -452,7 +452,7 @@ struct task_struct { struct dentry *proc_dentry; struct backing_dev_info *backing_dev_info; - struct as_io_context *as_io_context; + struct io_context *io_context; unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ diff --git a/kernel/exit.c b/kernel/exit.c index 8471381546af..ebc839b645a7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -682,8 +682,8 @@ NORET_TYPE void do_exit(long code) panic("Attempted to kill the idle task!"); if (unlikely(tsk->pid == 1)) panic("Attempted to kill init!"); - if (tsk->as_io_context) - exit_as_io_context(); + if (tsk->io_context) + exit_io_context(); tsk->flags |= PF_EXITING; del_timer_sync(&tsk->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index fcdc884cd894..96ce3385cc75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -864,7 +864,7 @@ struct task_struct *copy_process(unsigned long clone_flags, p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; - p->as_io_context = NULL; + p->io_context = NULL; retval = -ENOMEM; if ((retval = security_task_alloc(p))) -- cgit v1.2.3 From 930805a244eaadb5aefbc08b558db72136128388 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:26 -0700 Subject: [PATCH] block request batching From: Nick Piggin The following patch gets batching working how it should be. After a process is woken up, it is allowed to allocate up to 32 requests for 20ms. It does not stop other processes submitting requests if it isn't submitting though. This should allow less context switches, and allow batches of requests from each process to be sent to the io scheduler instead of 1 request from each process. tiobench sequential writes are more than tripled, random writes are nearly doubled over mm1. In earlier tests I generally saw better CPU efficiency but it doesn't show here. There is still debug to be taken out. Its also only on UP. 
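The batching window described above, restated as a standalone user-space sketch (jiffies and HZ are mocked, time_before() becomes a plain comparison, and the constants are taken from the BLK_BATCH_* definitions in the hunk below): a freshly woken task is granted a 32-request allowance, and it keeps priority access while it still has allowance left and, once it has used any of it, for at most 20ms after it was woken.

#include <stdio.h>

#define HZ		1000UL
#define BLK_BATCH_TIME	(HZ / 50UL)	/* 20ms worth of mock "jiffies" */
#define BLK_BATCH_REQ	32

/* just the two fields the batching logic needs from struct io_context */
struct io_context {
	unsigned long last_waited;
	int nr_batch_requests;
};

static unsigned long jiffies;	/* mocked clock */

static int ioc_batching(struct io_context *ioc)
{
	if (!ioc)
		return 0;
	/* time_before() without jiffies wraparound handling */
	return ioc->nr_batch_requests == BLK_BATCH_REQ ||
		(ioc->nr_batch_requests > 0 &&
		 jiffies < ioc->last_waited + BLK_BATCH_TIME);
}

static void ioc_set_batching(struct io_context *ioc)
{
	if (!ioc || ioc_batching(ioc))
		return;
	ioc->nr_batch_requests = BLK_BATCH_REQ;
	ioc->last_waited = jiffies;
}

int main(void)
{
	struct io_context ioc = { 0, 0 };

	ioc_set_batching(&ioc);			/* the task just woke in get_request_wait() */
	printf("batching right away: %d\n", ioc_batching(&ioc));

	ioc.nr_batch_requests--;		/* it used one request... */
	jiffies += BLK_BATCH_TIME + 1;		/* ...but more than 20ms have passed */
	printf("batching after the window: %d\n", ioc_batching(&ioc));
	return 0;
}

In the sketch the task stops being a batcher as soon as the window lapses with part of the allowance unused, which is the condition get_request() checks before letting a task dip into a full queue.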
Avg Maximum Lat% Lat% CPU Identifier Rate (CPU%) Latency Latency >2s >10s Eff ------------------- ------ --------- ---------- ------- ------ ---- -2.5.71-mm1 11.13 3.783% 46.10 24668.01 0.84 0.02 294 +2.5.71-mm1 13.21 4.489% 37.37 5691.66 0.76 0.00 294 Random Reads ------------------- ------ --------- ---------- ------- ------ ---- -2.5.71-mm1 0.97 0.582% 519.86 6444.66 11.93 0.00 167 +2.5.71-mm1 1.01 0.604% 484.59 6604.93 10.73 0.00 167 Sequential Writes ------------------- ------ --------- ---------- ------- ------ ---- -2.5.71-mm1 4.85 4.456% 77.80 99359.39 0.18 0.13 109 +2.5.71-mm1 14.11 14.19% 10.07 22805.47 0.09 0.04 99 Random Writes ------------------- ------ --------- ---------- ------- ------ ---- -2.5.71-mm1 0.46 0.371% 14.48 6173.90 0.23 0.00 125 +2.5.71-mm1 0.86 0.744% 24.08 8753.66 0.31 0.00 115 It decreases context switch rate on IBM's 8-way on ext2 tiobench 64 threads from ~2500/s to ~140/s on their regression tests. --- drivers/block/ll_rw_blk.c | 132 ++++++++++++++++++++++++++++++---------------- include/linux/blkdev.h | 6 +++ 2 files changed, 94 insertions(+), 44 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 8f44b5690d9a..633266ee8c87 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -51,10 +51,11 @@ static struct workqueue_struct *kblockd_workqueue; unsigned long blk_max_low_pfn, blk_max_pfn; -static inline int batch_requests(struct request_queue *q) -{ - return q->nr_requests - min(q->nr_requests / 8, 8UL) - 1; -} +/* Amount of time in which a process may batch requests */ +#define BLK_BATCH_TIME (HZ/50UL) + +/* Number of requests a "batching" process may submit */ +#define BLK_BATCH_REQ 32 /* * Return the threshold (number of used requests) at which the queue is @@ -1305,24 +1306,76 @@ static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) return NULL; } +/* + * ioc_batching returns true if the ioc is a valid batching request and + * should be given priority access to a request. + */ +static inline int ioc_batching(struct io_context *ioc) +{ + if (!ioc) + return 0; + + return ioc->nr_batch_requests == BLK_BATCH_REQ || + (ioc->nr_batch_requests > 0 + && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); +} + +/* + * ioc_set_batching sets ioc to be a new "batcher" if it is not one + */ +void ioc_set_batching(struct io_context *ioc) +{ + if (!ioc || ioc_batching(ioc)) + return; + + ioc->nr_batch_requests = BLK_BATCH_REQ; + ioc->last_waited = jiffies; +} + +/* + * A request has just been released. Account for it, update the full and + * congestion status, wake up any waiters. Called under q->queue_lock. 
+ */ +static void freed_request(request_queue_t *q, int rw) +{ + struct request_list *rl = &q->rq; + + rl->count[rw]--; + if (rl->count[rw] < queue_congestion_off_threshold(q)) + clear_queue_congested(q, rw); + if (rl->count[rw]+1 <= q->nr_requests) { + smp_mb(); + if (waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); + if (!waitqueue_active(&rl->wait[rw])) + blk_clear_queue_full(q, rw); + } +} + #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) /* * Get a free request, queue_lock must not be held */ -static struct request * -get_request(request_queue_t *q, int rw, int gfp_mask, int force) +static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; + struct io_context *ioc = get_io_context(); spin_lock_irq(q->queue_lock); - if (rl->count[rw] == q->nr_requests) - blk_set_queue_full(q, rw); + if (rl->count[rw]+1 >= q->nr_requests) { + if (!blk_queue_full(q, rw)) { + ioc_set_batching(ioc); + blk_set_queue_full(q, rw); + } + } - if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) { + if (blk_queue_full(q, rw) + && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; } + rl->count[rw]++; if (rl->count[rw] >= queue_congestion_on_threshold(q)) set_queue_congested(q, rw); @@ -1331,20 +1384,13 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force) rq = blk_alloc_request(q, gfp_mask); if (!rq) { spin_lock_irq(q->queue_lock); - rl->count[rw]--; - if (rl->count[rw] < queue_congestion_off_threshold(q)) - clear_queue_congested(q, rw); - - if (rl->count[rw] <= batch_requests(q)) { - if (waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); - else - blk_clear_queue_full(q, rw); - } - + freed_request(q, rw); spin_unlock_irq(q->queue_lock); goto out; } + + if (ioc_batching(ioc)) + ioc->nr_batch_requests--; INIT_LIST_HEAD(&rq->queuelist); @@ -1367,6 +1413,7 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force) rq->sense = NULL; out: + put_io_context(ioc); return rq; } @@ -1378,7 +1425,6 @@ static struct request *get_request_wait(request_queue_t *q, int rw) { DEFINE_WAIT(wait); struct request *rq; - int waited = 0; generic_unplug_device(q); do { @@ -1387,11 +1433,15 @@ static struct request *get_request_wait(request_queue_t *q, int rw) prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw, GFP_NOIO, waited); + rq = get_request(q, rw, GFP_NOIO); if (!rq) { + struct io_context *ioc; + io_schedule(); - waited = 1; + ioc = get_io_context(); + ioc_set_batching(ioc); + put_io_context(ioc); } finish_wait(&rl->wait[rw], &wait); } while (!rq); @@ -1408,7 +1458,7 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) if (gfp_mask & __GFP_WAIT) rq = get_request_wait(q, rw); else - rq = get_request(q, rw, gfp_mask, 0); + rq = get_request(q, rw, gfp_mask); return rq; } @@ -1555,17 +1605,7 @@ void __blk_put_request(request_queue_t *q, struct request *req) BUG_ON(!list_empty(&req->queuelist)); blk_free_request(q, req); - - rl->count[rw]--; - if (rl->count[rw] < queue_congestion_off_threshold(q)) - clear_queue_congested(q, rw); - - if (rl->count[rw] <= batch_requests(q)) { - if (waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); - else - blk_clear_queue_full(q, rw); - } + freed_request(q, rw); } } @@ -1808,7 +1848,7 @@ get_rq: freereq = NULL; } else { spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) { + if ((freereq 
= get_request(q, rw, GFP_ATOMIC)) == NULL) { /* * READA bit set */ @@ -1852,13 +1892,12 @@ out: __blk_put_request(q, freereq); if (blk_queue_plugged(q)) { - int nr_queued = q->rq.count[0] + q->rq.count[1]; + int nr_queued = q->rq.count[READ] + q->rq.count[WRITE]; if (nr_queued == q->unplug_thresh) __generic_unplug_device(q); } spin_unlock_irq(q->queue_lock); - return 0; end_io: @@ -1866,7 +1905,6 @@ end_io: return 0; } - /* * If bio->bi_dev is a partition, remap the location */ @@ -2378,6 +2416,7 @@ int __init blk_dev_init(void) return 0; } +static atomic_t nr_io_contexts = ATOMIC_INIT(0); /* * IO Context helper functions @@ -2393,6 +2432,7 @@ void put_io_context(struct io_context *ioc) if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); kfree(ioc); + atomic_dec(&nr_io_contexts); } } @@ -2409,7 +2449,8 @@ void exit_io_context(void) ioc->aic->exit(ioc->aic); put_io_context(ioc); current->io_context = NULL; - } + } else + WARN_ON(1); local_irq_restore(flags); } @@ -2432,8 +2473,11 @@ struct io_context *get_io_context(void) if (ret == NULL) { ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (ret) { + atomic_inc(&nr_io_contexts); atomic_set(&ret->refcount, 1); ret->pid = tsk->pid; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; tsk->io_context = ret; } @@ -2515,16 +2559,16 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (rl->count[READ] >= q->nr_requests) { blk_set_queue_full(q, READ); - } else if (rl->count[READ] <= batch_requests(q)) { + } else if (rl->count[READ]+1 <= q->nr_requests) { blk_clear_queue_full(q, READ); - wake_up_all(&rl->wait[READ]); + wake_up(&rl->wait[READ]); } if (rl->count[WRITE] >= q->nr_requests) { blk_set_queue_full(q, WRITE); - } else if (rl->count[WRITE] <= batch_requests(q)) { + } else if (rl->count[WRITE]+1 <= q->nr_requests) { blk_clear_queue_full(q, WRITE); - wake_up_all(&rl->wait[WRITE]); + wake_up(&rl->wait[WRITE]); } return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13116a7a7969..69178ca80d7d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -59,6 +59,12 @@ struct io_context { atomic_t refcount; pid_t pid; + /* + * For request batching + */ + unsigned long last_waited; /* Time last woken after wait for request */ + int nr_batch_requests; /* Number of requests left in the batch */ + struct as_io_context *aic; }; -- cgit v1.2.3 From 07581dd2bdd67146d13a61ca6506c6c8b694666a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:34 -0700 Subject: [PATCH] get_io_context fixes - pass gfp_flags to get_io_context(): not all callers are forced to use GFP_ATOMIC(). - fix locking in get_io_context(): bump the refcount whilein the exclusive region. - don't go oops in get_io_context() if the kmalloc failed. - in as_get_io_context(): fail the whole thing if we were unable to allocate the AS-specific part. 
- as_remove_queued_request() cleanup --- drivers/block/as-iosched.c | 50 ++++++++++++++++++++++------------------------ drivers/block/ll_rw_blk.c | 9 +++++---- include/linux/blkdev.h | 2 +- 3 files changed, 30 insertions(+), 31 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index d63c92dfcf96..b19289348fb0 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -219,13 +219,17 @@ static struct as_io_context *alloc_as_io_context(void) */ static struct io_context *as_get_io_context(void) { - struct io_context *ioc = get_io_context(); - if (ioc && !ioc->aic) + struct io_context *ioc = get_io_context(GFP_ATOMIC); + if (ioc && !ioc->aic) { ioc->aic = alloc_as_io_context(); + if (!ioc->aic) { + put_io_context(ioc); + ioc = NULL; + } + } return ioc; } - /* * the back merge hash support functions */ @@ -971,32 +975,26 @@ static void as_completed_request(request_queue_t *q, struct request *rq) static void as_remove_queued_request(request_queue_t *q, struct request *rq) { struct as_rq *arq = RQ_DATA(rq); + const int data_dir = arq->is_sync; + struct as_data *ad = q->elevator.elevator_data; - if (!arq) - BUG(); - else { - const int data_dir = arq->is_sync; - struct as_data *ad = q->elevator.elevator_data; - - WARN_ON(arq->state != AS_RQ_QUEUED); - - if (arq->io_context && arq->io_context->aic) { - BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); - atomic_dec(&arq->io_context->aic->nr_queued); - } - - /* - * Update the "next_arq" cache if we are about to remove its - * entry - */ - if (ad->next_arq[data_dir] == arq) - ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + WARN_ON(arq->state != AS_RQ_QUEUED); - list_del_init(&arq->fifo); - as_remove_merge_hints(q, arq); - as_del_arq_rb(ad, arq); + if (arq->io_context && arq->io_context->aic) { + BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); + atomic_dec(&arq->io_context->aic->nr_queued); } + /* + * Update the "next_arq" cache if we are about to remove its + * entry + */ + if (ad->next_arq[data_dir] == arq) + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + + list_del_init(&arq->fifo); + as_remove_merge_hints(q, arq); + as_del_arq_rb(ad, arq); } /* @@ -1292,7 +1290,7 @@ static void as_add_request(struct as_data *ad, struct as_rq *arq) arq->io_context = as_get_io_context(); - if (arq->io_context && arq->io_context->aic) { + if (arq->io_context) { atomic_inc(&arq->io_context->aic->nr_queued); as_update_iohist(arq->io_context->aic, arq->request); } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 633266ee8c87..13cc6073bb47 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1360,7 +1360,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; - struct io_context *ioc = get_io_context(); + struct io_context *ioc = get_io_context(gfp_mask); spin_lock_irq(q->queue_lock); if (rl->count[rw]+1 >= q->nr_requests) { @@ -1439,7 +1439,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) struct io_context *ioc; io_schedule(); - ioc = get_io_context(); + ioc = get_io_context(GFP_NOIO); ioc_set_batching(ioc); put_io_context(ioc); } @@ -2462,7 +2462,7 @@ void exit_io_context(void) * But weird things happen, so we disable local interrupts to ensure exclusive * access to *current. 
*/ -struct io_context *get_io_context(void) +struct io_context *get_io_context(int gfp_flags) { struct task_struct *tsk = current; unsigned long flags; @@ -2482,8 +2482,9 @@ struct io_context *get_io_context(void) tsk->io_context = ret; } } + if (ret) + atomic_inc(&ret->refcount); local_irq_restore(flags); - atomic_inc(&ret->refcount); return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69178ca80d7d..2e7f92aa1dc2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -70,7 +70,7 @@ struct io_context { void put_io_context(struct io_context *ioc); void exit_io_context(void); -struct io_context *get_io_context(void); +struct io_context *get_io_context(int gfp_flags); void copy_io_context(struct io_context **pdst, struct io_context **psrc); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); -- cgit v1.2.3
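Taken together, the series leaves callers of the io_context API with a simple contract: get_io_context() may return NULL when the allocation fails, otherwise it returns the current task's context with an extra reference that must be dropped with put_io_context(). A user-space mock of that contract (the pid value and the malloc() stand-in are arbitrary; this is not the kernel implementation):

#include <stdio.h>
#include <stdlib.h>

/* minimal stand-in for struct io_context: just refcount and pid */
struct io_context {
	int refcount;
	int pid;
};

static struct io_context *current_ioc;	/* plays the role of current->io_context */

static struct io_context *get_io_context(void)
{
	struct io_context *ret = current_ioc;

	if (ret == NULL) {
		ret = malloc(sizeof(*ret));	/* may fail, like kmalloc(GFP_ATOMIC) */
		if (ret) {
			ret->refcount = 1;	/* the task's own reference */
			ret->pid = 42;		/* arbitrary stand-in for tsk->pid */
			current_ioc = ret;
		}
	}
	if (ret)
		ret->refcount++;	/* take the caller's reference before returning */
	return ret;
}

static void put_io_context(struct io_context *ioc)
{
	if (ioc == NULL)
		return;		/* callers may hand back the NULL they were given */
	if (--ioc->refcount == 0)
		free(ioc);	/* in the kernel the task's own ref is dropped last, by exit_io_context() */
}

int main(void)
{
	struct io_context *ioc = get_io_context();

	if (!ioc)
		return 1;	/* allocation failed: no context, but no oops either */
	printf("pid %d, refcount %d\n", ioc->pid, ioc->refcount);
	put_io_context(ioc);	/* every successful get is paired with a put */
	return 0;
}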