diff options
| author | Andrew Morton <akpm@digeo.com> | 2003-02-05 16:56:31 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@home.transmeta.com> | 2003-02-05 16:56:31 -0800 |
| commit | 00c8e791cba1bb88db8a8fd73106c28fdbab5716 (patch) | |
| tree | d1242ea5923a8b6fba8b95c909b8670b260f261e | |
| parent | c5070032bb8db845535ac8c85d45884138a08a4f (diff) | |
[PATCH] self-unplugging request queues
The patch teaches a queue to unplug itself:
a) if it has four requests OR
b) if it has had plugged requests for 3 milliseconds.
These numbers may need to be tuned, although doing so doesn't seem to
make much difference. 10 msecs works OK, so HZ=100 machines will be
fine.
Instrumentation shows that about 5-10% of requests were started due to
the three millisecond timeout (during a kernel compile). That's
somewhat significant. It means that the kernel is leaving stuff in the
queue, plugged, for too long. This testing was with a uniprocessor
preemptible kernel, which is particularly vulnerable to unplug latency
(submit some IO, get preempted before the unplug).
This patch permits the removal of a lot of rather lame unplugging in
page reclaim and in the writeback code, which kicks the queues
(globally!) every four megabytes to get writeback underway.
This patch doesn't use blk_run_queues(). It is able to kick just the
particular queue.
The patch is not expected to make much difference really, except for
AIO. AIO needs a blk_run_queues() in its io_submit() call — for each
request. This means that AIO has to disable plugging altogether,
unless something like this patch does it for it. It means that AIO
will unplug *all* queues in the machine for every io_submit(). Even
against a socket!
This patch was tested by disabling blk_run_queues() completely. The
system ran OK.
The 3 milliseconds may be too long. It's OK for the heavy writeback
code, but AIO may want less. Or maybe AIO really wants zero (ie:
disable plugging). If that is so, we need new code paths by which AIO
can communicate the "immediate unplug" information - a global unplug is
not good.
To minimise unplug latency due to user CPU load, this patch gives keventd
`nice -10'. This is of course completely arbitrary. Really, I think keventd
should be SCHED_RR/MAX_RT_PRIO-1, as it has been in -aa kernels for ages.
| -rw-r--r-- | drivers/block/ll_rw_blk.c | 40 | ||||
| -rw-r--r-- | include/linux/blkdev.h | 10 | ||||
| -rw-r--r-- | kernel/workqueue.c | 1 |
3 files changed, 51 insertions, 0 deletions
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index f8a4e7a81f4b..e13d0bbca144 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -27,6 +27,8 @@ #include <linux/completion.h> #include <linux/slab.h> +static void blk_unplug_work(void *data); + /* * For the allocated request tables */ @@ -237,6 +239,14 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) blk_queue_hardsect_size(q, 512); blk_queue_dma_alignment(q, 511); + q->unplug_thresh = 4; /* hmm */ + q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ + if (q->unplug_delay == 0) + q->unplug_delay = 1; + + init_timer(&q->unplug_timer); + INIT_WORK(&q->unplug_work, blk_unplug_work, q); + /* * by default assume old behaviour and bounce for any highmem page */ @@ -960,6 +970,7 @@ void blk_plug_device(request_queue_t *q) if (!blk_queue_plugged(q)) { spin_lock(&blk_plug_lock); list_add_tail(&q->plug_list, &blk_plug_list); + mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); spin_unlock(&blk_plug_lock); } } @@ -974,6 +985,7 @@ int blk_remove_plug(request_queue_t *q) if (blk_queue_plugged(q)) { spin_lock(&blk_plug_lock); list_del_init(&q->plug_list); + del_timer(&q->unplug_timer); spin_unlock(&blk_plug_lock); return 1; } @@ -992,6 +1004,8 @@ static inline void __generic_unplug_device(request_queue_t *q) if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) return; + del_timer(&q->unplug_timer); + /* * was plugged, fire request_fn if queue has stuff to do */ @@ -1020,6 +1034,18 @@ void generic_unplug_device(void *data) spin_unlock_irq(q->queue_lock); } +static void blk_unplug_work(void *data) +{ + generic_unplug_device(data); +} + +static void blk_unplug_timeout(unsigned long data) +{ + request_queue_t *q = (request_queue_t *)data; + + schedule_work(&q->unplug_work); +} + /** * blk_start_queue - restart a previously stopped queue * @q: The &request_queue_t in question @@ -1164,6 +1190,9 @@ void blk_cleanup_queue(request_queue_t * q) count -= 
__blk_cleanup_queue(&q->rq[READ]); count -= __blk_cleanup_queue(&q->rq[WRITE]); + del_timer_sync(&q->unplug_timer); + flush_scheduled_work(); + if (count) printk("blk_cleanup_queue: leaked requests (%d)\n", count); @@ -1269,6 +1298,9 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) blk_queue_make_request(q, __make_request); blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); + q->unplug_timer.function = blk_unplug_timeout; + q->unplug_timer.data = (unsigned long)q; + blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); @@ -1811,7 +1843,15 @@ get_rq: out: if (freereq) __blk_put_request(q, freereq); + + if (blk_queue_plugged(q)) { + int nr_queued = (queue_nr_requests - q->rq[0].count) + + (queue_nr_requests - q->rq[1].count); + if (nr_queued == q->unplug_thresh) + __generic_unplug_device(q); + } spin_unlock_irq(q->queue_lock); + return 0; end_io: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c599ea36233b..82766b7e60b0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -4,6 +4,8 @@ #include <linux/major.h> #include <linux/genhd.h> #include <linux/list.h> +#include <linux/timer.h> +#include <linux/workqueue.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/wait.h> @@ -188,6 +190,14 @@ struct request_queue unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; + /* + * Auto-unplugging state + */ + struct timer_list unplug_timer; + int unplug_thresh; /* After this many requests */ + unsigned long unplug_delay; /* After this many jiffies */ + struct work_struct unplug_work; + struct backing_dev_info backing_dev_info; /* diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 156583c7dbf7..fb10d360c436 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -177,6 +177,7 @@ static int worker_thread(void *__startup) current->flags |= PF_IOTHREAD; cwq->thread = current; + set_user_nice(current, -10); set_cpus_allowed(current, 1UL << 
cpu); spin_lock_irq(¤t->sig->siglock); |
