| author | Andrew Morton <akpm@osdl.org> | 2003-07-04 19:37:26 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@home.osdl.org> | 2003-07-04 19:37:26 -0700 |
| commit | 930805a244eaadb5aefbc08b558db72136128388 (patch) | |
| tree | 8b760be8c4fee468172cf15c95d702974533d140 | |
| parent | 16f88dbdbffa3dc52b959706e6a311a932b51ed6 (diff) | |
[PATCH] block request batching
From: Nick Piggin <piggin@cyberone.com.au>
The following patch gets batching working the way it should.

After a process is woken up, it is allowed to allocate up to 32 requests
for 20ms. It does not stop other processes from submitting requests while
it is idle, though. This should mean fewer context switches, and allows a
batch of requests from each process to be sent to the io scheduler
instead of one request from each process.
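(For illustration only, not part of the patch: a minimal stand-alone C sketch of
that 32-request/20ms window, mirroring the ioc_batching()/ioc_set_batching()
helpers added below. The simulated jiffies counter, the assumed HZ of 100, the
trimmed io_context with only the two new fields, and the main() driver are
stand-ins for the example; the kernel uses the real jiffies and time_before(),
which also handles wraparound.)

/* Stand-alone sketch of the batching window; HZ and jiffies are simulated. */
#include <stdio.h>

#define HZ              100UL           /* assumed tick rate: HZ/50 == 2 ticks ~= 20ms */
#define BLK_BATCH_TIME  (HZ / 50UL)     /* length of the batching window */
#define BLK_BATCH_REQ   32              /* requests a "batching" process may submit */

struct io_context {
        unsigned long last_waited;      /* tick when the process was last woken */
        int nr_batch_requests;          /* requests left in the current batch */
};

static unsigned long jiffies;           /* simulated tick counter */

/* Mirrors ioc_batching(): is this process still inside its batch window? */
static int ioc_batching(struct io_context *ioc)
{
        if (!ioc)
                return 0;
        /* plain comparison here; the kernel uses time_before() for wraparound */
        return ioc->nr_batch_requests == BLK_BATCH_REQ ||
                (ioc->nr_batch_requests > 0 &&
                 jiffies < ioc->last_waited + BLK_BATCH_TIME);
}

/* Mirrors ioc_set_batching(): grant a fresh batch to a newly woken process */
static void ioc_set_batching(struct io_context *ioc)
{
        if (!ioc || ioc_batching(ioc))
                return;
        ioc->nr_batch_requests = BLK_BATCH_REQ;
        ioc->last_waited = jiffies;
}

int main(void)
{
        struct io_context ioc = { 0, 0 };
        int allocated = 0;

        ioc_set_batching(&ioc);                 /* process just woke after waiting */
        while (ioc_batching(&ioc)) {            /* gets priority request allocations */
                ioc.nr_batch_requests--;        /* as get_request() does per request */
                allocated++;
                jiffies++;                      /* pretend each request takes one tick */
        }
        printf("allocated %d requests inside the batch window\n", allocated);
        return 0;
}

With one simulated tick per request, the 20ms limit cuts the batch short after
two allocations; a process that submits faster than the clock advances gets the
full 32 requests before losing its priority.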
tiobench sequential writes are more than tripled, and random writes are nearly
doubled over mm1. In earlier tests I generally saw better CPU efficiency, but
it doesn't show here. There is still debug code to be taken out. It's also
only on UP.
                                     Avg        Maximum     Lat%    Lat%   CPU
Identifier           Rate  (CPU%)    Latency    Latency     >2s     >10s   Eff
Sequential Reads
------------------- ------ --------- ---------- ---------- ------- ------ ----
-2.5.71-mm1          11.13  3.783%       46.10   24668.01    0.84    0.02   294
+2.5.71-mm1          13.21  4.489%       37.37    5691.66    0.76    0.00   294
Random Reads
------------------- ------ --------- ---------- ---------- ------- ------ ----
-2.5.71-mm1           0.97  0.582%      519.86    6444.66   11.93    0.00   167
+2.5.71-mm1           1.01  0.604%      484.59    6604.93   10.73    0.00   167
Sequential Writes
------------------- ------ --------- ---------- ---------- ------- ------ ----
-2.5.71-mm1           4.85  4.456%       77.80   99359.39    0.18    0.13   109
+2.5.71-mm1          14.11 14.19%        10.07   22805.47    0.09    0.04    99
Random Writes
------------------- ------ --------- ---------- ---------- ------- ------ ----
-2.5.71-mm1           0.46  0.371%       14.48    6173.90    0.23    0.00   125
+2.5.71-mm1           0.86  0.744%       24.08    8753.66    0.31    0.00   115
On IBM's 8-way regression tests (ext2, tiobench, 64 threads), it decreases the
context switch rate from ~2500/s to ~140/s.
| -rw-r--r-- | drivers/block/ll_rw_blk.c | 132 |
| -rw-r--r-- | include/linux/blkdev.h | 6 |

2 files changed, 94 insertions, 44 deletions
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 8f44b5690d9a..633266ee8c87 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -51,10 +51,11 @@ static struct workqueue_struct *kblockd_workqueue;
 
 unsigned long blk_max_low_pfn, blk_max_pfn;
 
-static inline int batch_requests(struct request_queue *q)
-{
-        return q->nr_requests - min(q->nr_requests / 8, 8UL) - 1;
-}
+/* Amount of time in which a process may batch requests */
+#define BLK_BATCH_TIME  (HZ/50UL)
+
+/* Number of requests a "batching" process may submit */
+#define BLK_BATCH_REQ   32
 
 /*
  * Return the threshold (number of used requests) at which the queue is
@@ -1305,24 +1306,76 @@ static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask)
         return NULL;
 }
 
+/*
+ * ioc_batching returns true if the ioc is a valid batching request and
+ * should be given priority access to a request.
+ */
+static inline int ioc_batching(struct io_context *ioc)
+{
+        if (!ioc)
+                return 0;
+
+        return ioc->nr_batch_requests == BLK_BATCH_REQ ||
+                (ioc->nr_batch_requests > 0
+                && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
+}
+
+/*
+ * ioc_set_batching sets ioc to be a new "batcher" if it is not one
+ */
+void ioc_set_batching(struct io_context *ioc)
+{
+        if (!ioc || ioc_batching(ioc))
+                return;
+
+        ioc->nr_batch_requests = BLK_BATCH_REQ;
+        ioc->last_waited = jiffies;
+}
+
+/*
+ * A request has just been released.  Account for it, update the full and
+ * congestion status, wake up any waiters.  Called under q->queue_lock.
+ */
+static void freed_request(request_queue_t *q, int rw)
+{
+        struct request_list *rl = &q->rq;
+
+        rl->count[rw]--;
+        if (rl->count[rw] < queue_congestion_off_threshold(q))
+                clear_queue_congested(q, rw);
+        if (rl->count[rw]+1 <= q->nr_requests) {
+                smp_mb();
+                if (waitqueue_active(&rl->wait[rw]))
+                        wake_up(&rl->wait[rw]);
+                if (!waitqueue_active(&rl->wait[rw]))
+                        blk_clear_queue_full(q, rw);
+        }
+}
+
 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
 
 /*
  * Get a free request, queue_lock must not be held
  */
-static struct request *
-get_request(request_queue_t *q, int rw, int gfp_mask, int force)
+static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
 {
         struct request *rq = NULL;
         struct request_list *rl = &q->rq;
+        struct io_context *ioc = get_io_context();
 
         spin_lock_irq(q->queue_lock);
-        if (rl->count[rw] == q->nr_requests)
-                blk_set_queue_full(q, rw);
+        if (rl->count[rw]+1 >= q->nr_requests) {
+                if (!blk_queue_full(q, rw)) {
+                        ioc_set_batching(ioc);
+                        blk_set_queue_full(q, rw);
+                }
+        }
 
-        if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) {
+        if (blk_queue_full(q, rw)
+                        && !ioc_batching(ioc) && !elv_may_queue(q, rw)) {
                 spin_unlock_irq(q->queue_lock);
                 goto out;
         }
+
         rl->count[rw]++;
         if (rl->count[rw] >= queue_congestion_on_threshold(q))
                 set_queue_congested(q, rw);
@@ -1331,20 +1384,13 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force)
         rq = blk_alloc_request(q, gfp_mask);
         if (!rq) {
                 spin_lock_irq(q->queue_lock);
-                rl->count[rw]--;
-                if (rl->count[rw] < queue_congestion_off_threshold(q))
-                        clear_queue_congested(q, rw);
-
-                if (rl->count[rw] <= batch_requests(q)) {
-                        if (waitqueue_active(&rl->wait[rw]))
-                                wake_up(&rl->wait[rw]);
-                        else
-                                blk_clear_queue_full(q, rw);
-                }
-
+                freed_request(q, rw);
                 spin_unlock_irq(q->queue_lock);
                 goto out;
         }
+
+        if (ioc_batching(ioc))
+                ioc->nr_batch_requests--;
 
         INIT_LIST_HEAD(&rq->queuelist);
@@ -1367,6 +1413,7 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force)
         rq->sense = NULL;
 
 out:
+        put_io_context(ioc);
         return rq;
 }
 
@@ -1378,7 +1425,6 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
 {
         DEFINE_WAIT(wait);
         struct request *rq;
-        int waited = 0;
 
         generic_unplug_device(q);
         do {
@@ -1387,11 +1433,15 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
                 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
                                 TASK_UNINTERRUPTIBLE);
 
-                rq = get_request(q, rw, GFP_NOIO, waited);
+                rq = get_request(q, rw, GFP_NOIO);
 
                 if (!rq) {
+                        struct io_context *ioc;
+
                         io_schedule();
-                        waited = 1;
+                        ioc = get_io_context();
+                        ioc_set_batching(ioc);
+                        put_io_context(ioc);
                 }
                 finish_wait(&rl->wait[rw], &wait);
         } while (!rq);
@@ -1408,7 +1458,7 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
         if (gfp_mask & __GFP_WAIT)
                 rq = get_request_wait(q, rw);
         else
-                rq = get_request(q, rw, gfp_mask, 0);
+                rq = get_request(q, rw, gfp_mask);
 
         return rq;
 }
@@ -1555,17 +1605,7 @@ void __blk_put_request(request_queue_t *q, struct request *req)
                 BUG_ON(!list_empty(&req->queuelist));
 
                 blk_free_request(q, req);
-
-                rl->count[rw]--;
-                if (rl->count[rw] < queue_congestion_off_threshold(q))
-                        clear_queue_congested(q, rw);
-
-                if (rl->count[rw] <= batch_requests(q)) {
-                        if (waitqueue_active(&rl->wait[rw]))
-                                wake_up(&rl->wait[rw]);
-                        else
-                                blk_clear_queue_full(q, rw);
-                }
+                freed_request(q, rw);
         }
 }
 
@@ -1808,7 +1848,7 @@ get_rq:
                 freereq = NULL;
         } else {
                 spin_unlock_irq(q->queue_lock);
-                if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) {
+                if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
                         /*
                          * READA bit set
                          */
@@ -1852,13 +1892,12 @@ out:
                 __blk_put_request(q, freereq);
 
         if (blk_queue_plugged(q)) {
-                int nr_queued = q->rq.count[0] + q->rq.count[1];
+                int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
 
                 if (nr_queued == q->unplug_thresh)
                         __generic_unplug_device(q);
         }
         spin_unlock_irq(q->queue_lock);
-
         return 0;
 
 end_io:
@@ -1866,7 +1905,6 @@ end_io:
         return 0;
 }
 
-
 /*
  * If bio->bi_dev is a partition, remap the location
  */
@@ -2378,6 +2416,7 @@ int __init blk_dev_init(void)
         return 0;
 }
 
+static atomic_t nr_io_contexts = ATOMIC_INIT(0);
 
 /*
  * IO Context helper functions
@@ -2393,6 +2432,7 @@ void put_io_context(struct io_context *ioc)
                 if (ioc->aic && ioc->aic->dtor)
                         ioc->aic->dtor(ioc->aic);
                 kfree(ioc);
+                atomic_dec(&nr_io_contexts);
         }
 }
 
@@ -2409,7 +2449,8 @@ void exit_io_context(void)
                         ioc->aic->exit(ioc->aic);
                 put_io_context(ioc);
                 current->io_context = NULL;
-        }
+        } else
+                WARN_ON(1);
 
         local_irq_restore(flags);
 }
@@ -2432,8 +2473,11 @@ struct io_context *get_io_context(void)
         if (ret == NULL) {
                 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
                 if (ret) {
+                        atomic_inc(&nr_io_contexts);
                         atomic_set(&ret->refcount, 1);
                         ret->pid = tsk->pid;
+                        ret->last_waited = jiffies; /* doesn't matter... */
+                        ret->nr_batch_requests = 0; /* because this is 0 */
                         ret->aic = NULL;
                         tsk->io_context = ret;
                 }
@@ -2515,16 +2559,16 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 
         if (rl->count[READ] >= q->nr_requests) {
                 blk_set_queue_full(q, READ);
-        } else if (rl->count[READ] <= batch_requests(q)) {
+        } else if (rl->count[READ]+1 <= q->nr_requests) {
                 blk_clear_queue_full(q, READ);
-                wake_up_all(&rl->wait[READ]);
+                wake_up(&rl->wait[READ]);
         }
 
         if (rl->count[WRITE] >= q->nr_requests) {
                 blk_set_queue_full(q, WRITE);
-        } else if (rl->count[WRITE] <= batch_requests(q)) {
+        } else if (rl->count[WRITE]+1 <= q->nr_requests) {
                 blk_clear_queue_full(q, WRITE);
-                wake_up_all(&rl->wait[WRITE]);
+                wake_up(&rl->wait[WRITE]);
         }
 
         return ret;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 13116a7a7969..69178ca80d7d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -59,6 +59,12 @@ struct io_context {
         atomic_t refcount;
         pid_t pid;
 
+        /*
+         * For request batching
+         */
+        unsigned long last_waited; /* Time last woken after wait for request */
+        int nr_batch_requests;     /* Number of requests left in the batch */
+
         struct as_io_context *aic;
 };
