From 98b069b9e50c88105ef9e7fa134554ae0cdcd044 Mon Sep 17 00:00:00 2001 From: Pete Zaitcev Date: Fri, 3 Sep 2004 17:25:14 +0200 Subject: [PATCH] USB: Fixes for ub in 2.6.9-rc1 from Oliver and Pat - Set the allocation size in REQUEST SENSE (Pat LaVarre) - Move add_timer invocations to safer places (Oliver Neukum) Signed-off-by: Greg Kroah-Hartman --- drivers/block/ub.c | 59 +++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 27 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ub.c b/drivers/block/ub.c index f605535d3f56..6379b9043631 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -786,17 +786,16 @@ static int ub_scsi_cmd_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd) sc->work_urb.error_count = 0; sc->work_urb.status = 0; - sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; - add_timer(&sc->work_timer); - if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { /* XXX Clear stalls */ printk("ub: cmd #%d start failed (%d)\n", cmd->tag, rc); /* P3 */ - del_timer(&sc->work_timer); ub_complete(&sc->work_done); return rc; } + sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; + add_timer(&sc->work_timer); + cmd->state = UB_CMDST_CMD; ub_cmdtr_state(sc, cmd); return 0; @@ -968,18 +967,17 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) sc->work_urb.error_count = 0; sc->work_urb.status = 0; - sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; - add_timer(&sc->work_timer); - if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { /* XXX Clear stalls */ printk("ub: data #%d submit failed (%d)\n", cmd->tag, rc); /* P3 */ - del_timer(&sc->work_timer); ub_complete(&sc->work_done); ub_state_done(sc, cmd, rc); return; } + sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; + add_timer(&sc->work_timer); + cmd->state = UB_CMDST_DATA; ub_cmdtr_state(sc, cmd); @@ -1063,19 +1061,18 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) sc->work_urb.error_count = 0; sc->work_urb.status = 0; - sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; - add_timer(&sc->work_timer); - rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC); if (rc != 0) { /* XXX Clear stalls */ printk("%s: CSW #%d submit failed (%d)\n", sc->name, cmd->tag, rc); /* P3 */ - del_timer(&sc->work_timer); ub_complete(&sc->work_done); ub_state_done(sc, cmd, rc); return; } + + sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; + add_timer(&sc->work_timer); return; } @@ -1186,18 +1183,17 @@ static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) sc->work_urb.error_count = 0; sc->work_urb.status = 0; - sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; - add_timer(&sc->work_timer); - if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { /* XXX Clear stalls */ printk("ub: CSW #%d submit failed (%d)\n", cmd->tag, rc); /* P3 */ - del_timer(&sc->work_timer); ub_complete(&sc->work_done); ub_state_done(sc, cmd, rc); return; } + sc->work_timer.expires = jiffies + UB_URB_TIMEOUT; + add_timer(&sc->work_timer); + cmd->stat_count = 0; cmd->state = UB_CMDST_STAT; ub_cmdtr_state(sc, cmd); @@ -1217,9 +1213,17 @@ static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd) goto error; } + /* + * ``If the allocation length is eighteen or greater, and a device + * server returns less than eighteen bytes of data, the application + * client should assume that the bytes not transferred would have been + * zeroes had the device server returned those bytes.'' + */ memset(&sc->top_sense, 0, UB_SENSE_SIZE); + scmd =
&sc->top_rqs_cmd; scmd->cdb[0] = REQUEST_SENSE; + scmd->cdb[4] = UB_SENSE_SIZE; scmd->cdb_len = 6; scmd->dir = UB_DIR_READ; scmd->state = UB_CMDST_INIT; @@ -1271,14 +1275,13 @@ static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd, sc->work_urb.error_count = 0; sc->work_urb.status = 0; - sc->work_timer.expires = jiffies + UB_CTRL_TIMEOUT; - add_timer(&sc->work_timer); - if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) { - del_timer(&sc->work_timer); ub_complete(&sc->work_done); return rc; } + + sc->work_timer.expires = jiffies + UB_CTRL_TIMEOUT; + add_timer(&sc->work_timer); return 0; } @@ -1289,6 +1292,9 @@ static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd) unsigned char *sense = scmd->data; struct ub_scsi_cmd *cmd; + /* + * Ignoring scmd->act_len, because the buffer was pre-zeroed. + */ ub_cmdtr_sense(sc, scmd, sense); if ((cmd = ub_cmdq_peek(sc)) == NULL) { @@ -1725,19 +1731,18 @@ static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe) sc->work_urb.error_count = 0; sc->work_urb.status = 0; - init_timer(&timer); - timer.function = ub_probe_timeout; - timer.data = (unsigned long) &compl; - timer.expires = jiffies + UB_CTRL_TIMEOUT; - add_timer(&timer); - if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) { printk(KERN_WARNING "%s: Unable to submit a probe clear (%d)\n", sc->name, rc); - del_timer_sync(&timer); return rc; } + init_timer(&timer); + timer.function = ub_probe_timeout; + timer.data = (unsigned long) &compl; + timer.expires = jiffies + UB_CTRL_TIMEOUT; + add_timer(&timer); + wait_for_completion(&compl); del_timer_sync(&timer); -- cgit v1.2.3 From b51f163dd33b10311d61f2e2b55107d0a316cac5 Mon Sep 17 00:00:00 2001 From: Pete Zaitcev Date: Mon, 13 Sep 2004 21:56:06 -0700 Subject: [PATCH] USB: Patch for 3 ub bugs in 2.6.9-rc1-mm4 Actual users of ub quickly found problems, so here's a patch to address some of them. #1: An attempt to mount a CF card, pull the plug, then unmount causes a message "getblk: bad sector size 512" and an oops. This is caused by trying to do put_disk from disconnect instead of using a reference count. The sd.c does it this way (it uses kref). #2: The hald fills /var/log/messages with block device errors. It seems that it happens because ub allowed opens of known offline devices, and then partition checking produced those errors. I hope taking code from sd.c should fix it. Also I replaced usb_unlink_urb with usb_kill_urb. Signed-off-by: Greg Kroah-Hartman --- drivers/block/ub.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 6379b9043631..184b8a22e86f 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -490,6 +490,18 @@ static void ub_id_put(int id) */ static void ub_cleanup(struct ub_dev *sc) { + + /* + * If we zero disk->private_data BEFORE put_disk, we have to check + * for NULL all over the place in open, release, check_media and + * revalidate, because the block level semaphore is well inside the + * put_disk. But we cannot zero after the call, because *disk is gone. + * The sd.c is blatantly racy in this area. 
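+ * (Hence the compromise below: private_data is left alone, and ub_cleanup only drops the gendisk reference before recycling the id.)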
+ */ + /* disk->private_data = NULL; */ + put_disk(sc->disk); + sc->disk = NULL; + ub_id_put(sc->id); kfree(sc); } @@ -1413,7 +1425,15 @@ static int ub_bd_open(struct inode *inode, struct file *filp) if (sc->removable || sc->readonly) check_disk_change(inode->i_bdev); - /* XXX sd.c and floppy.c bail on open if media is not present. */ + /* + * The sd.c considers ->media_present and ->changed not equivalent, + * under some pretty murky conditions (a failure of READ CAPACITY). + * We may need it one day. + */ + if (sc->removable && sc->changed && !(filp->f_flags & O_NDELAY)) { + rc = -ENOMEDIUM; + goto err_open; + } if (sc->readonly && (filp->f_mode & FMODE_WRITE)) { rc = -EROFS; @@ -1498,8 +1518,11 @@ static int ub_bd_revalidate(struct gendisk *disk) printk(KERN_INFO "%s: device %u capacity nsec %ld bsize %u\n", sc->name, sc->dev->devnum, sc->capacity.nsec, sc->capacity.bsize); + /* XXX Support sector size switching like in sr.c */ + // blk_queue_hardsect_size(q, sc->capacity.bsize); set_capacity(disk, sc->capacity.nsec); // set_disk_ro(sdkp->disk, sc->readonly); + return 0; } @@ -1746,12 +1769,7 @@ static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe) wait_for_completion(&compl); del_timer_sync(&timer); - /* - * Most of the time, URB was done and dev set to NULL, and so - * the unlink bounces out with ENODEV. We do not call usb_kill_urb - * because we still think about a backport to 2.4. - */ - usb_unlink_urb(&sc->work_urb); + usb_kill_urb(&sc->work_urb); /* reset the endpoint toggle */ usb_settoggle(sc->dev, endp, usb_pipeout(sc->last_pipe), 0); @@ -2010,17 +2028,6 @@ static void ub_disconnect(struct usb_interface *intf) if (q) blk_cleanup_queue(q); - /* - * If we zero disk->private_data BEFORE put_disk, we have to check - * for NULL all over the place in open, release, check_media and - * revalidate, because the block level semaphore is well inside the - * put_disk. But we cannot zero after the call, because *disk is gone. - * The sd.c is blatantly racy in this area. - */ - /* disk->private_data = NULL; */ - put_disk(disk); - sc->disk = NULL; - /* * We really expect blk_cleanup_queue() to wait, so no amount * of paranoia is too much. -- cgit v1.2.3 From 957f22f3569378cec8fc5cbae38ac676ed691264 Mon Sep 17 00:00:00 2001 From: Pete Zaitcev Date: Wed, 29 Sep 2004 00:51:35 -0700 Subject: [PATCH] USB: Fixes for ub in 2.6.9-rc2-mm2 - Do retries for a memory key which was handed out at Kernel Summit 04. - Add missing del_timer. - Add shifts for a 2KB block size device, from Pat LaVarre. Signed-off-by: Pete Zaitcev Signed-off-by: Greg Kroah-Hartman --- drivers/block/ub.c | 53 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 16 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 184b8a22e86f..efaf8264c345 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -25,6 +25,7 @@ * -- prune comments, they are too voluminous * -- Exterminate P3 printks * -- Resolve XXX's + * -- Redo "benh's retries", perhaps have spin-up code to handle them. V:D=? */ #include #include @@ -157,7 +158,8 @@ struct ub_scsi_cmd { struct ub_scsi_cmd *next; int error; /* Return code - valid upon done */ - int act_len; /* Return size */ + unsigned int act_len; /* Return size */ + unsigned char key, asc, ascq; /* May be valid if error==-EIO */ int stat_count; /* Retries getting status.
*/ @@ -673,9 +675,12 @@ static inline int ub_bd_rq_fn_1(request_queue_t *q) /* * build the command + * + * The call to blk_queue_hardsect_size() guarantees that the request + * is aligned, but it is given in terms of 512 byte units, always. */ - block = rq->sector; - nblks = rq->nr_sectors; + block = rq->sector >> sc->capacity.bshift; + nblks = rq->nr_sectors >> sc->capacity.bshift; memset(cmd, 0, sizeof(struct ub_scsi_cmd)); cmd->cdb[0] = (ub_dir == UB_DIR_READ)? READ_10: WRITE_10; @@ -690,7 +695,7 @@ static inline int ub_bd_rq_fn_1(request_queue_t *q) cmd->dir = ub_dir; cmd->state = UB_CMDST_INIT; cmd->data = rq->buffer; - cmd->len = nblks * 512; + cmd->len = rq->nr_sectors * 512; cmd->done = ub_rw_cmd_done; cmd->back = rq; @@ -837,6 +842,7 @@ static void ub_urb_complete(struct urb *urb, struct pt_regs *pt) { struct ub_dev *sc = urb->context; + del_timer(&sc->work_timer); ub_complete(&sc->work_done); tasklet_schedule(&sc->tasklet); } @@ -1141,16 +1147,8 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) (*cmd->done)(sc, cmd); } else if (cmd->state == UB_CMDST_SENSE) { - /* - * We do not look at sense, because even if there was no sense, - * we get into UB_CMDST_SENSE from a STALL or CSW FAIL only. - * We request sense because we want to clear CHECK CONDITION - * on devices with delusions of SCSI, and not because we - * are curious in any way about the sense itself. - */ - /* if ((cmd->top_sense[2] & 0x0F) == NO_SENSE) { foo } */ - ub_state_done(sc, cmd, -EIO); + } else { printk(KERN_WARNING "%s: " "wrong command state %d on device %u\n", @@ -1309,6 +1307,10 @@ static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd) */ ub_cmdtr_sense(sc, scmd, sense); + /* + * Find the command which triggered the unit attention or a check, + * save the sense into it, and advance its state machine. + */ if ((cmd = ub_cmdq_peek(sc)) == NULL) { printk(KERN_WARNING "%s: sense done while idle\n", sc->name); return; @@ -1326,6 +1328,10 @@ static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd) return; } + cmd->key = sense[2] & 0x0F; + cmd->asc = sense[12]; + cmd->ascq = sense[13]; + ub_scsi_urb_compl(sc, cmd); } @@ -1519,7 +1525,7 @@ static int ub_bd_revalidate(struct gendisk *disk) sc->name, sc->dev->devnum, sc->capacity.nsec, sc->capacity.bsize); /* XXX Support sector size switching like in sr.c */ - // blk_queue_hardsect_size(q, sc->capacity.bsize); + blk_queue_hardsect_size(disk->queue, sc->capacity.bsize); set_capacity(disk, sc->capacity.nsec); // set_disk_ro(sdkp->disk, sc->readonly); @@ -1621,6 +1627,9 @@ static int ub_sync_tur(struct ub_dev *sc) rc = cmd->error; + if (rc == -EIO && cmd->key != 0) /* Retries for benh's key */ + rc = cmd->key; + err_submit: kfree(cmd); err_alloc: @@ -1836,6 +1845,7 @@ static int ub_probe(struct usb_interface *intf, request_queue_t *q; struct gendisk *disk; int rc; + int i; rc = -ENOMEM; if ((sc = kmalloc(sizeof(struct ub_dev), GFP_KERNEL)) == NULL) @@ -1902,7 +1912,11 @@ static int ub_probe(struct usb_interface *intf, * has to succeed, so we clear checks with an additional one here. * In any case it's not our business how revalidation is implemented.
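 * (The retry loop below sends TEST UNIT READY up to three times, sleeping 10 ms between attempts for as long as the unit keeps returning sense key 0x6, UNIT ATTENTION.)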
*/ - ub_sync_tur(sc); + for (i = 0; i < 3; i++) { /* Retries for benh's key */ + if ((rc = ub_sync_tur(sc)) <= 0) break; + if (rc != 0x6) break; + msleep(10); + } sc->removable = 1; /* XXX Query this from the device */ @@ -1938,7 +1952,7 @@ static int ub_probe(struct usb_interface *intf, blk_queue_max_phys_segments(q, UB_MAX_REQ_SG); // blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); blk_queue_max_sectors(q, UB_MAX_SECTORS); - // blk_queue_hardsect_size(q, xxxxx); + blk_queue_hardsect_size(q, sc->capacity.bsize); /* * This is a serious infraction, caused by a deficiency in the @@ -2046,6 +2060,13 @@ static void ub_disconnect(struct usb_interface *intf) } spin_unlock_irqrestore(&sc->lock, flags); + /* + * There is virtually no chance that other CPU runs times so long + * after ub_urb_complete should have called del_timer, but only if HCD + * didn't forget to deliver a callback on unlink. + */ + del_timer_sync(&sc->work_timer); + /* * At this point there must be no commands coming from anyone * and no URBs left in transit. -- cgit v1.2.3 From 680a90e8bd0ebbd8e82b01abca713a652c665768 Mon Sep 17 00:00:00 2001 From: Pete Zaitcev Date: Mon, 4 Oct 2004 00:34:36 -0700 Subject: [PATCH] USB: fix oops with latest ub driver in -mm tree On Fri, 1 Oct 2004 11:39:17 -0700 Greg KH wrote: > Pete, any ideas? Oh, it also happens on my UP laptop. >[...] > kernel BUG at kernel/timer.c:413! I have a suspicion. Actually, it was pointed to me by a kind soul before, but I forgot who he was, unfortunately. I'm not sure if this is the problem, but please try it if you can. It should apply on top of "latest". I really hate that word, but in this case I haven't got a version number. Signed-off-by: Greg Kroah-Hartman --- drivers/block/ub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/ub.c b/drivers/block/ub.c index efaf8264c345..0c3567159375 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -842,7 +842,6 @@ static void ub_urb_complete(struct urb *urb, struct pt_regs *pt) { struct ub_dev *sc = urb->context; - del_timer(&sc->work_timer); ub_complete(&sc->work_done); tasklet_schedule(&sc->tasklet); } @@ -853,6 +852,7 @@ static void ub_scsi_action(unsigned long _dev) unsigned long flags; spin_lock_irqsave(&sc->lock, flags); + del_timer(&sc->work_timer); ub_scsi_dispatch(sc); spin_unlock_irqrestore(&sc->lock, flags); } -- cgit v1.2.3 From 0cbd0fa4ac47991d38f945cb2bc51923204a0917 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 15 Oct 2004 02:12:52 -0700 Subject: USB: add endian markups to the ub driver. 
Signed-off-by: Greg Kroah-Hartman --- drivers/block/ub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 0c3567159375..dd3be0a06219 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -63,9 +63,9 @@ /* command block wrapper */ struct bulk_cb_wrap { - u32 Signature; /* contains 'USBC' */ + __le32 Signature; /* contains 'USBC' */ u32 Tag; /* unique per command id */ - u32 DataTransferLength; /* size of data */ + __le32 DataTransferLength; /* size of data */ u8 Flags; /* direction in bit 0 */ u8 Lun; /* LUN normally 0 */ u8 Length; /* of the CDB */ @@ -79,9 +79,9 @@ struct bulk_cb_wrap { /* command status wrapper */ struct bulk_cs_wrap { - u32 Signature; /* should = 'USBS' */ + __le32 Signature; /* should = 'USBS' */ u32 Tag; /* same as original command */ - u32 Residue; /* amount not transferred */ + __le32 Residue; /* amount not transferred */ u8 Status; /* see below */ }; @@ -1692,8 +1692,8 @@ static int ub_sync_read_cap(struct ub_dev *sc, struct ub_capacity *ret) } /* sd.c special-cases sector size of 0 to mean 512. Needed? Safe? */ - nsec = be32_to_cpu(*(u32 *)p) + 1; - bsize = be32_to_cpu(*(u32 *)(p + 4)); + nsec = be32_to_cpu(*(__be32 *)p) + 1; + bsize = be32_to_cpu(*(__be32 *)(p + 4)); switch (bsize) { case 512: shift = 0; break; case 1024: shift = 1; break; -- cgit v1.2.3 From 2f8e2dc86c9876edca632e8ef2ab1f68d1b753f0 Mon Sep 17 00:00:00 2001 From: Peter Osterlund Date: Mon, 18 Oct 2004 17:57:34 -0700 Subject: [PATCH] CDRW packet writing support This patch implements CDRW packet writing as a kernel block device. Usage instructions are in the packet-writing.txt file. A hint: If you don't want to wait for a complete disc format, you can format just a part of the disc. For example: cdrwtool -d /dev/hdc -m 10240 This will format 10240 blocks, ie 20MB. Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cdrom/00-INDEX | 2 + Documentation/cdrom/packet-writing.txt | 86 + drivers/block/Kconfig | 33 + drivers/block/Makefile | 1 + drivers/block/pktcdvd.c | 2679 ++++++++++++++++++++++++++++++++ drivers/cdrom/Makefile | 1 + drivers/ide/ide-cd.c | 6 +- drivers/scsi/sr.c | 6 +- fs/compat_ioctl.c | 1 + include/linux/cdrom.h | 1 + include/linux/compat_ioctl.h | 2 + include/linux/pktcdvd.h | 275 ++++ 12 files changed, 3088 insertions(+), 5 deletions(-) create mode 100644 Documentation/cdrom/packet-writing.txt create mode 100644 drivers/block/pktcdvd.c create mode 100644 include/linux/pktcdvd.h (limited to 'drivers/block') diff --git a/Documentation/cdrom/00-INDEX b/Documentation/cdrom/00-INDEX index eae6896676f2..916dafe29d3f 100644 --- a/Documentation/cdrom/00-INDEX +++ b/Documentation/cdrom/00-INDEX @@ -22,6 +22,8 @@ mcdx - info on improved Mitsumi CD-ROM driver. optcd - info on the Optics Storage 8000 AT CD-ROM driver +packet-writing.txt + - Info on the CDRW packet writing module sbpcd - info on the SoundBlaster/Panasonic CD-ROM interface driver. sjcd diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt new file mode 100644 index 000000000000..d34fcbca9f27 --- /dev/null +++ b/Documentation/cdrom/packet-writing.txt @@ -0,0 +1,86 @@ +Getting started quick +--------------------- + +- Select packet support in the block device section and UDF support in + the file system section. + +- Compile and install kernel and modules, reboot.
+ +- You need the udftools package (pktsetup, mkudffs, cdrwtool). + Download from http://sourceforge.net/projects/linux-udf/ + +- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute + as appropriate): + # cdrwtool -d /dev/hdc -q + +- Setup your writer + # pktsetup dev_name /dev/hdc + +- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy! + # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime + + +Packet writing for DVD-RW media +------------------------------- + +DVD-RW discs can be written to much like CD-RW discs if they are in +the so called "restricted overwrite" mode. To put a disc in restricted +overwrite mode, run: + + # dvd+rw-format /dev/hdc + +You can then use the disc the same way you would use a CD-RW disc: + + # pktsetup dev_name /dev/hdc + # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime + + +Packet writing for DVD+RW media +------------------------------- + +According to the DVD+RW specification, a drive supporting DVD+RW discs +shall implement "true random writes with 2KB granularity", which means +that it should be possible to put any filesystem with a block size >= +2KB on such a disc. For example, it should be possible to do: + + # mkudffs /dev/hdc + # mount /dev/hdc /cdrom -t udf -o rw,noatime + +However, some drives don't follow the specification and expect the +host to perform aligned writes at 32KB boundaries. Other drives do +follow the specification, but suffer bad performance problems if the +writes are not 32KB aligned. + +Both problems can be solved by using the pktcdvd driver, which always +generates aligned writes. + + # pktsetup dev_name /dev/hdc + # mkudffs /dev/pktcdvd/dev_name + # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime + + +Notes +----- + +- CD-RW media can usually not be overwritten more than about 1000 + times, so to avoid unnecessary wear on the media, you should always + use the noatime mount option. + +- Defect management (ie automatic remapping of bad sectors) has not + been implemented yet, so you are likely to get at least some + filesystem corruption if the disc wears out. + +- Since the pktcdvd driver makes the disc appear as a regular block + device with a 2KB block size, you can put any filesystem you like on + the disc. For example, run: + + # /sbin/mke2fs /dev/pktcdvd/dev_name + + to create an ext2 filesystem on the disc. + + +Links +----- + +See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information +about DVD writing. diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a1d50242b8cd..6a43c807497d 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -356,6 +356,39 @@ config LBD your machine, or if you want to have a raid or loopback device bigger than 2TB. Otherwise say N. +config CDROM_PKTCDVD + tristate "Packet writing on CD/DVD media" + help + If you have a CDROM drive that supports packet writing, say Y to + include preliminary support. It should work with any MMC/Mt Fuji + compliant ATAPI or SCSI drive, which is just about any newer CD + writer. + + Currently only writing to CD-RW, DVD-RW and DVD+RW discs is possible. + DVD-RW disks must be in restricted overwrite mode. + + To compile this driver as a module, choose M here: the + module will be called pktcdvd. + +config CDROM_PKTCDVD_BUFFERS + int "Free buffers for data gathering" + depends on CDROM_PKTCDVD + default "8" + help + This controls the maximum number of active concurrent packets. More + concurrent packets can increase write performance, but also require + more memory. 
Each concurrent packet will require approximately 64Kb + of non-swappable kernel memory, which will be allocated at + pktsetup time. + +config CDROM_PKTCDVD_WCACHE + bool "Enable write caching" + depends on CDROM_PKTCDVD + help + If enabled, write caching will be set for the CD-R/W device. For now + this option is dangerous unless the CD-RW media is known good, as we + don't do deferred write error handling yet. + source "drivers/s390/block/Kconfig" endmenu diff --git a/drivers/block/Makefile b/drivers/block/Makefile index c8fbbf14ce94..1cf09a1c065b 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_BLK_DEV_XD) += xd.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o +obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c new file mode 100644 index 000000000000..fb80b6a91f84 --- /dev/null +++ b/drivers/block/pktcdvd.c @@ -0,0 +1,2679 @@ +/* + * Copyright (C) 2000 Jens Axboe + * Copyright (C) 2001-2004 Peter Osterlund + * + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. + * + * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and + * DVD-RW devices (aka an exercise in block layer masturbation) + * + * + * TODO: (circa order of when I will fix it) + * - Only able to write on CD-RW media right now. + * - check host application code on media and set it in write page + * - interface for UDF <-> packet to negotiate a new location when a write + * fails. + * - handle OPC, especially for -RW media + * + * Theory of operation: + * + * We use a custom make_request_fn function that forwards reads directly to + * the underlying CD device. Write requests are either attached directly to + * a live packet_data object, or simply stored sequentially in a list for + * later processing by the kcdrwd kernel thread. This driver doesn't use + * any elevator functionality as defined by the elevator_s struct, but the + * underlying CD device uses a standard elevator. + * + * This strategy makes it possible to do very late merging of IO requests. + * A new bio sent to pkt_make_request can be merged with a live packet_data + * object even if the object is in the data gathering state. + * + *************************************************************************/ + +#define VERSION_CODE "v0.2.0a 2004-07-14 Jens Axboe (axboe@suse.de) and petero2@telia.com" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if PACKET_DEBUG +#define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) +#else +#define DPRINTK(fmt, args...) +#endif + +#if PACKET_DEBUG > 1 +#define VPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) +#else +#define VPRINTK(fmt, args...)
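+/* verbose tracing compiles away to nothing unless PACKET_DEBUG > 1 */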
+#endif + +#define MAX_SPEED 0xffff + +#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1)) + +static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; +static struct proc_dir_entry *pkt_proc; +static int pkt_major; +static struct semaphore ctl_mutex; /* Serialize open/close/setup/teardown */ +static mempool_t *psd_pool; + + +static void pkt_bio_finished(struct pktcdvd_device *pd) +{ + BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); + if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { + VPRINTK("pktcdvd: queue empty\n"); + atomic_set(&pd->iosched.attention, 1); + wake_up(&pd->wqueue); + } +} + +static void pkt_bio_destructor(struct bio *bio) +{ + kfree(bio->bi_io_vec); + kfree(bio); +} + +static struct bio *pkt_bio_alloc(int nr_iovecs) +{ + struct bio_vec *bvl = NULL; + struct bio *bio; + + bio = kmalloc(sizeof(struct bio), GFP_KERNEL); + if (!bio) + goto no_bio; + bio_init(bio); + + bvl = kmalloc(nr_iovecs * sizeof(struct bio_vec), GFP_KERNEL); + if (!bvl) + goto no_bvl; + memset(bvl, 0, nr_iovecs * sizeof(struct bio_vec)); + + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; + bio->bi_destructor = pkt_bio_destructor; + + return bio; + + no_bvl: + kfree(bio); + no_bio: + return NULL; +} + +/* + * Allocate a packet_data struct + */ +static struct packet_data *pkt_alloc_packet_data(void) +{ + int i; + struct packet_data *pkt; + + pkt = kmalloc(sizeof(struct packet_data), GFP_KERNEL); + if (!pkt) + goto no_pkt; + memset(pkt, 0, sizeof(struct packet_data)); + + pkt->w_bio = pkt_bio_alloc(PACKET_MAX_SIZE); + if (!pkt->w_bio) + goto no_bio; + + for (i = 0; i < PAGES_PER_PACKET; i++) { + pkt->pages[i] = alloc_page(GFP_KERNEL); + if (!pkt->pages[i]) + goto no_page; + } + for (i = 0; i < PAGES_PER_PACKET; i++) + clear_page(page_address(pkt->pages[i])); + + spin_lock_init(&pkt->lock); + + for (i = 0; i < PACKET_MAX_SIZE; i++) { + struct bio *bio = pkt_bio_alloc(1); + if (!bio) + goto no_rd_bio; + pkt->r_bios[i] = bio; + } + + return pkt; + +no_rd_bio: + for (i = 0; i < PACKET_MAX_SIZE; i++) { + struct bio *bio = pkt->r_bios[i]; + if (bio) + bio_put(bio); + } + +no_page: + for (i = 0; i < PAGES_PER_PACKET; i++) + if (pkt->pages[i]) + __free_page(pkt->pages[i]); + bio_put(pkt->w_bio); +no_bio: + kfree(pkt); +no_pkt: + return NULL; +} + +/* + * Free a packet_data struct + */ +static void pkt_free_packet_data(struct packet_data *pkt) +{ + int i; + + for (i = 0; i < PACKET_MAX_SIZE; i++) { + struct bio *bio = pkt->r_bios[i]; + if (bio) + bio_put(bio); + } + for (i = 0; i < PAGES_PER_PACKET; i++) + __free_page(pkt->pages[i]); + bio_put(pkt->w_bio); + kfree(pkt); +} + +static void pkt_shrink_pktlist(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *next; + + BUG_ON(!list_empty(&pd->cdrw.pkt_active_list)); + + list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { + pkt_free_packet_data(pkt); + } +} + +static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) +{ + struct packet_data *pkt; + + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); + INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); + spin_lock_init(&pd->cdrw.active_list_lock); + while (nr_packets > 0) { + pkt = pkt_alloc_packet_data(); + if (!pkt) { + pkt_shrink_pktlist(pd); + return 0; + } + pkt->id = nr_packets; + pkt->pd = pd; + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + nr_packets--; + } + return 1; +} + +static void *pkt_rb_alloc(int gfp_mask, void *data) +{ + return kmalloc(sizeof(struct pkt_rb_node), gfp_mask); +} + +static void pkt_rb_free(void *ptr, void *data) +{ + kfree(ptr); +} 
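+/* + * Queueing overview: each incoming write bio is wrapped in a pkt_rb_node + * and kept in pd->bio_queue, an rb tree sorted by starting sector, so that + * pkt_handle_queue() can cheaply collect every bio falling into one zone. + * A zone is a packet-aligned span of the disc: e.g. with a 32-frame packet + * (settings.size of 128 sectors) and zero offset, ZONE() maps sectors + * 0..127 to zone 0, 128..255 to zone 128, and so on. + */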
+ +static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) +{ + struct rb_node *n = rb_next(&node->rb_node); + if (!n) + return NULL; + return rb_entry(n, struct pkt_rb_node, rb_node); +} + +static inline void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + rb_erase(&node->rb_node, &pd->bio_queue); + mempool_free(node, pd->rb_pool); + pd->bio_queue_size--; + BUG_ON(pd->bio_queue_size < 0); +} + +/* + * Find the first node in the pd->bio_queue rb tree with a starting sector >= s. + */ +static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s) +{ + struct rb_node *n = pd->bio_queue.rb_node; + struct rb_node *next; + struct pkt_rb_node *tmp; + + if (!n) { + BUG_ON(pd->bio_queue_size > 0); + return NULL; + } + + for (;;) { + tmp = rb_entry(n, struct pkt_rb_node, rb_node); + if (s <= tmp->bio->bi_sector) + next = n->rb_left; + else + next = n->rb_right; + if (!next) + break; + n = next; + } + + if (s > tmp->bio->bi_sector) { + tmp = pkt_rbtree_next(tmp); + if (!tmp) + return NULL; + } + BUG_ON(s > tmp->bio->bi_sector); + return tmp; +} + +/* + * Insert a node into the pd->bio_queue rb tree. + */ +static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + struct rb_node **p = &pd->bio_queue.rb_node; + struct rb_node *parent = NULL; + sector_t s = node->bio->bi_sector; + struct pkt_rb_node *tmp; + + while (*p) { + parent = *p; + tmp = rb_entry(parent, struct pkt_rb_node, rb_node); + if (s < tmp->bio->bi_sector) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->rb_node, parent, p); + rb_insert_color(&node->rb_node, &pd->bio_queue); + pd->bio_queue_size++; +} + +/* + * Add a bio to a single linked list defined by its head and tail pointers. + */ +static inline void pkt_add_list_last(struct bio *bio, struct bio **list_head, struct bio **list_tail) +{ + bio->bi_next = NULL; + if (*list_tail) { + BUG_ON((*list_head) == NULL); + (*list_tail)->bi_next = bio; + (*list_tail) = bio; + } else { + BUG_ON((*list_head) != NULL); + (*list_head) = bio; + (*list_tail) = bio; + } +} + +/* + * Remove and return the first bio from a single linked list defined by its + * head and tail pointers. + */ +static inline struct bio *pkt_get_list_first(struct bio **list_head, struct bio **list_tail) +{ + struct bio *bio; + + if (*list_head == NULL) + return NULL; + + bio = *list_head; + *list_head = bio->bi_next; + if (*list_head == NULL) + *list_tail = NULL; + + bio->bi_next = NULL; + return bio; +} + +/* + * Send a packet_command to the underlying block device and + * wait for completion. + */ +static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + char sense[SCSI_SENSE_BUFFERSIZE]; + request_queue_t *q; + struct request *rq; + DECLARE_COMPLETION(wait); + int err = 0; + + q = bdev_get_queue(pd->bdev); + + rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? 
WRITE : READ, + __GFP_WAIT); + rq->errors = 0; + rq->rq_disk = pd->bdev->bd_disk; + rq->bio = NULL; + rq->buffer = NULL; + rq->timeout = 60*HZ; + rq->data = cgc->buffer; + rq->data_len = cgc->buflen; + rq->sense = sense; + memset(sense, 0, sizeof(sense)); + rq->sense_len = 0; + rq->flags |= REQ_BLOCK_PC | REQ_HARDBARRIER; + if (cgc->quiet) + rq->flags |= REQ_QUIET; + memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); + if (sizeof(rq->cmd) > CDROM_PACKET_SIZE) + memset(rq->cmd + CDROM_PACKET_SIZE, 0, sizeof(rq->cmd) - CDROM_PACKET_SIZE); + + rq->ref_count++; + rq->flags |= REQ_NOMERGE; + rq->waiting = &wait; + elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); + generic_unplug_device(q); + wait_for_completion(&wait); + + if (rq->errors) + err = -EIO; + + blk_put_request(rq); + return err; +} + +/* + * A generic sense dump / resolve mechanism should be implemented across + * all ATAPI + SCSI devices. + */ +static void pkt_dump_sense(struct packet_command *cgc) +{ + static char *info[9] = { "No sense", "Recovered error", "Not ready", + "Medium error", "Hardware error", "Illegal request", + "Unit attention", "Data protect", "Blank check" }; + int i; + struct request_sense *sense = cgc->sense; + + printk("pktcdvd:"); + for (i = 0; i < CDROM_PACKET_SIZE; i++) + printk(" %02x", cgc->cmd[i]); + printk(" - "); + + if (sense == NULL) { + printk("no sense\n"); + return; + } + + printk("sense %02x.%02x.%02x", sense->sense_key, sense->asc, sense->ascq); + + if (sense->sense_key > 8) { + printk(" (INVALID)\n"); + return; + } + + printk(" (%s)\n", info[sense->sense_key]); +} + +/* + * flush the drive cache to media + */ +static int pkt_flush_cache(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_FLUSH_CACHE; + cgc.quiet = 1; + + /* + * the IMMED bit -- we default to not setting it, although that + * would allow a much faster close, this is safer + */ +#if 0 + cgc.cmd[1] = 1 << 1; +#endif + return pkt_generic_packet(pd, &cgc); +} + +/* + * speed is given as the normal factor, e.g. 4 for 4x + */ +static int pkt_set_speed(struct pktcdvd_device *pd, unsigned write_speed, unsigned read_speed) +{ + struct packet_command cgc; + struct request_sense sense; + int ret; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sense = &sense; + cgc.cmd[0] = GPCMD_SET_SPEED; + cgc.cmd[2] = (read_speed >> 8) & 0xff; + cgc.cmd[3] = read_speed & 0xff; + cgc.cmd[4] = (write_speed >> 8) & 0xff; + cgc.cmd[5] = write_speed & 0xff; + + if ((ret = pkt_generic_packet(pd, &cgc))) + pkt_dump_sense(&cgc); + + return ret; +} + +/* + * Queue a bio for processing by the low-level CD device. Must be called + * from process context. + */ +static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_prio_read) +{ + spin_lock(&pd->iosched.lock); + if (bio_data_dir(bio) == READ) { + pkt_add_list_last(bio, &pd->iosched.read_queue, + &pd->iosched.read_queue_tail); + if (high_prio_read) + pd->iosched.high_prio_read = 1; + } else { + pkt_add_list_last(bio, &pd->iosched.write_queue, + &pd->iosched.write_queue_tail); + } + spin_unlock(&pd->iosched.lock); + + atomic_set(&pd->iosched.attention, 1); + wake_up(&pd->wqueue); +} + +/* + * Process the queued read/write requests. This function handles special + * requirements for CDRW drives: + * - A cache flush command must be inserted before a read request if the + * previous request was a write. + * - Switching between reading and writing is slow, so don't do it more often + * than necessary.
+ * - Set the read speed according to current usage pattern. When only reading + * from the device, it's best to use the highest possible read speed, but + * when switching often between reading and writing, it's better to have the + * same read and write speeds. + * - Reads originating from user space should have higher priority than reads + * originating from pkt_gather_data, because some process is usually waiting + * on reads of the first kind. + */ +static void pkt_iosched_process_queue(struct pktcdvd_device *pd) +{ + request_queue_t *q; + + if (atomic_read(&pd->iosched.attention) == 0) + return; + atomic_set(&pd->iosched.attention, 0); + + q = bdev_get_queue(pd->bdev); + + for (;;) { + struct bio *bio; + int reads_queued, writes_queued, high_prio_read; + + spin_lock(&pd->iosched.lock); + reads_queued = (pd->iosched.read_queue != NULL); + writes_queued = (pd->iosched.write_queue != NULL); + if (!reads_queued) + pd->iosched.high_prio_read = 0; + high_prio_read = pd->iosched.high_prio_read; + spin_unlock(&pd->iosched.lock); + + if (!reads_queued && !writes_queued) + break; + + if (pd->iosched.writing) { + if (high_prio_read || (!writes_queued && reads_queued)) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + VPRINTK("pktcdvd: write, waiting\n"); + break; + } + pkt_flush_cache(pd); + pd->iosched.writing = 0; + } + } else { + if (!reads_queued && writes_queued) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + VPRINTK("pktcdvd: read, waiting\n"); + break; + } + pd->iosched.writing = 1; + } + } + + spin_lock(&pd->iosched.lock); + if (pd->iosched.writing) { + bio = pkt_get_list_first(&pd->iosched.write_queue, + &pd->iosched.write_queue_tail); + } else { + bio = pkt_get_list_first(&pd->iosched.read_queue, + &pd->iosched.read_queue_tail); + } + spin_unlock(&pd->iosched.lock); + + if (!bio) + continue; + + if (bio_data_dir(bio) == READ) + pd->iosched.successive_reads += bio->bi_size >> 10; + else + pd->iosched.successive_reads = 0; + if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { + if (pd->read_speed == pd->write_speed) { + pd->read_speed = MAX_SPEED; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } else { + if (pd->read_speed != pd->write_speed) { + pd->read_speed = pd->write_speed; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } + + atomic_inc(&pd->cdrw.pending_bios); + generic_make_request(bio); + } +} + +/* + * Special care is needed if the underlying block device has a small + * max_phys_segments value. 
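 + * For example, a 32-frame (64KB) packet pushed one segment per 2KB frame + * needs 32 physical segments; a queue that cannot take that many, but can + * take one segment per page (16 on 4KB pages), still works at the cost of + * the extra copying done in pkt_make_local_copy().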
+ */ +static int pkt_set_segment_merging(struct pktcdvd_device *pd, request_queue_t *q) +{ + if ((pd->settings.size << 9) / CD_FRAMESIZE <= q->max_phys_segments) { + /* + * The cdrom device can handle one segment/frame + */ + clear_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else if ((pd->settings.size << 9) / PAGE_SIZE <= q->max_phys_segments) { + /* + * We can handle this case at the expense of some extra memory + * copies during write operations + */ + set_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else { + printk("pktcdvd: cdrom max_phys_segments too small\n"); + return -EIO; + } +} + +/* + * Copy CD_FRAMESIZE bytes from src_bio into a destination page + */ +static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, + struct page *dst_page, int dst_offs) +{ + unsigned int copy_size = CD_FRAMESIZE; + + while (copy_size > 0) { + struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg); + void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) + + src_bvl->bv_offset + offs; + void *vto = page_address(dst_page) + dst_offs; + int len = min_t(int, copy_size, src_bvl->bv_len - offs); + + BUG_ON(len < 0); + memcpy(vto, vfrom, len); + kunmap_atomic(src_bvl->bv_page, KM_USER0); + + seg++; + offs = 0; + dst_offs += len; + copy_size -= len; + } +} + +/* + * Copy all data for this packet to pkt->pages[], so that + * a) The number of required segments for the write bio is minimized, which + * is necessary for some scsi controllers. + * b) The data can be used as cache to avoid read requests if we receive a + * new write request for the same zone. + */ +static void pkt_make_local_copy(struct packet_data *pkt, struct page **pages, int *offsets) +{ + int f, p, offs; + + /* Copy all data to pkt->pages[] */ + p = 0; + offs = 0; + for (f = 0; f < pkt->frames; f++) { + if (pages[f] != pkt->pages[p]) { + void *vfrom = kmap_atomic(pages[f], KM_USER0) + offsets[f]; + void *vto = page_address(pkt->pages[p]) + offs; + memcpy(vto, vfrom, CD_FRAMESIZE); + kunmap_atomic(pages[f], KM_USER0); + pages[f] = pkt->pages[p]; + offsets[f] = offs; + } else { + BUG_ON(offsets[f] != offs); + } + offs += CD_FRAMESIZE; + if (offs >= PAGE_SIZE) { + BUG_ON(offs > PAGE_SIZE); + offs = 0; + p++; + } + } +} + +static int pkt_end_io_read(struct bio *bio, unsigned int bytes_done, int err) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + if (bio->bi_size) + return 1; + + VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio, + (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); + + if (err) + atomic_inc(&pkt->io_errors); + if (atomic_dec_and_test(&pkt->io_wait)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + pkt_bio_finished(pd); + + return 0; +} + +static int pkt_end_io_packet_write(struct bio *bio, unsigned int bytes_done, int err) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + if (bio->bi_size) + return 1; + + VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err); + + pd->stats.pkt_ended++; + + pkt_bio_finished(pd); + atomic_dec(&pkt->io_wait); + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + return 0; +} + +/* + * Schedule reads for the holes in a packet + */ +static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int frames_read = 0; + struct bio *bio; + int f; + char written[PACKET_MAX_SIZE]; + + BUG_ON(!pkt->orig_bios); + + atomic_set(&pkt->io_wait, 0); + atomic_set(&pkt->io_errors, 0); + + if 
(pkt->cache_valid) { + VPRINTK("pkt_gather_data: zone %llx cached\n", + (unsigned long long)pkt->sector); + goto out_account; + } + + /* + * Figure out which frames we need to read before we can write. + */ + memset(written, 0, sizeof(written)); + spin_lock(&pkt->lock); + for (bio = pkt->orig_bios; bio; bio = bio->bi_next) { + int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); + int num_frames = bio->bi_size / CD_FRAMESIZE; + BUG_ON(first_frame < 0); + BUG_ON(first_frame + num_frames > pkt->frames); + for (f = first_frame; f < first_frame + num_frames; f++) + written[f] = 1; + } + spin_unlock(&pkt->lock); + + /* + * Schedule reads for missing parts of the packet. + */ + for (f = 0; f < pkt->frames; f++) { + int p, offset; + if (written[f]) + continue; + bio = pkt->r_bios[f]; + bio_init(bio); + bio->bi_max_vecs = 1; + bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); + bio->bi_bdev = pd->bdev; + bio->bi_end_io = pkt_end_io_read; + bio->bi_private = pkt; + + p = (f * CD_FRAMESIZE) / PAGE_SIZE; + offset = (f * CD_FRAMESIZE) % PAGE_SIZE; + VPRINTK("pkt_gather_data: Adding frame %d, page:%p offs:%d\n", + f, pkt->pages[p], offset); + if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) + BUG(); + + atomic_inc(&pkt->io_wait); + bio->bi_rw = READ; + pkt_queue_bio(pd, bio, 0); + frames_read++; + } + +out_account: + VPRINTK("pkt_gather_data: need %d frames for zone %llx\n", + frames_read, (unsigned long long)pkt->sector); + pd->stats.pkt_started++; + pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); + pd->stats.secs_w += pd->settings.size; +} + +/* + * Find a packet matching zone, or the least recently used packet if + * there is no match. + */ +static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone) +{ + struct packet_data *pkt; + + list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) { + if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) { + list_del_init(&pkt->list); + if (pkt->sector != zone) + pkt->cache_valid = 0; + break; + } + } + return pkt; +} + +static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + if (pkt->cache_valid) { + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + } else { + list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list); + } +} + +/* + * recover a failed write, query for relocation if possible + * + * returns 1 if recovery is possible, or 0 if not + * + */ +static int pkt_start_recovery(struct packet_data *pkt) +{ + /* + * FIXME. We need help from the file system to implement + * recovery handling. 
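+ * The disabled code below sketches the intended flow: find the super + * block living on the packet device, ask sb->s_op->relocate_blocks() for + * a replacement block, and re-aim pkt->sector and the write bio at the + * new location.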
+ */ + return 0; +#if 0 + struct request *rq = pkt->rq; + struct pktcdvd_device *pd = rq->rq_disk->private_data; + struct block_device *pkt_bdev; + struct super_block *sb = NULL; + unsigned long old_block, new_block; + sector_t new_sector; + + pkt_bdev = bdget(kdev_t_to_nr(pd->pkt_dev)); + if (pkt_bdev) { + sb = get_super(pkt_bdev); + bdput(pkt_bdev); + } + + if (!sb) + return 0; + + if (!sb->s_op || !sb->s_op->relocate_blocks) + goto out; + + old_block = pkt->sector / (CD_FRAMESIZE >> 9); + if (sb->s_op->relocate_blocks(sb, old_block, &new_block)) + goto out; + + new_sector = new_block * (CD_FRAMESIZE >> 9); + pkt->sector = new_sector; + + pkt->bio->bi_sector = new_sector; + pkt->bio->bi_next = NULL; + pkt->bio->bi_flags = 1 << BIO_UPTODATE; + pkt->bio->bi_idx = 0; + + BUG_ON(pkt->bio->bi_rw != (1 << BIO_RW)); + BUG_ON(pkt->bio->bi_vcnt != pkt->frames); + BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE); + BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write); + BUG_ON(pkt->bio->bi_private != pkt); + + drop_super(sb); + return 1; + +out: + drop_super(sb); + return 0; +#endif +} + +static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) +{ +#if PACKET_DEBUG > 1 + static const char *state_name[] = { + "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" + }; + enum packet_data_state old_state = pkt->state; + VPRINTK("pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, + state_name[old_state], state_name[state]); +#endif + pkt->state = state; +} + +/* + * Scan the work queue to see if we can start a new packet. + * returns non-zero if any work was done. + */ +static int pkt_handle_queue(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *p; + struct bio *bio = NULL; + sector_t zone = 0; /* Suppress gcc warning */ + struct pkt_rb_node *node, *first_node; + struct rb_node *n; + + VPRINTK("handle_queue\n"); + + atomic_set(&pd->scan_queue, 0); + + if (list_empty(&pd->cdrw.pkt_free_list)) { + VPRINTK("handle_queue: no pkt\n"); + return 0; + } + + /* + * Try to find a zone we are not already working on. + */ + spin_lock(&pd->lock); + first_node = pkt_rbtree_find(pd, pd->current_sector); + if (!first_node) { + n = rb_first(&pd->bio_queue); + if (n) + first_node = rb_entry(n, struct pkt_rb_node, rb_node); + } + node = first_node; + while (node) { + bio = node->bio; + zone = ZONE(bio->bi_sector, pd); + list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { + if (p->sector == zone) + goto try_next_bio; + } + break; +try_next_bio: + node = pkt_rbtree_next(node); + if (!node) { + n = rb_first(&pd->bio_queue); + if (n) + node = rb_entry(n, struct pkt_rb_node, rb_node); + } + if (node == first_node) + node = NULL; + } + spin_unlock(&pd->lock); + if (!bio) { + VPRINTK("handle_queue: no bio\n"); + return 0; + } + + pkt = pkt_get_packet_data(pd, zone); + BUG_ON(!pkt); + + pd->current_sector = zone + pd->settings.size; + pkt->sector = zone; + pkt->frames = pd->settings.size >> 2; + BUG_ON(pkt->frames > PACKET_MAX_SIZE); + pkt->write_size = 0; + + /* + * Scan work queue for bios in the same zone and link them + * to this packet. 
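+ * (Each matching bio is pulled off the rb tree here, so a burst of small + * scattered writes to one zone collapses into a single packet write.)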
+ */ + spin_lock(&pd->lock); + VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone); + while ((node = pkt_rbtree_find(pd, zone)) != NULL) { + bio = node->bio; + VPRINTK("pkt_handle_queue: found zone=%llx\n", + (unsigned long long)ZONE(bio->bi_sector, pd)); + if (ZONE(bio->bi_sector, pd) != zone) + break; + pkt_rbtree_erase(pd, node); + spin_lock(&pkt->lock); + pkt_add_list_last(bio, &pkt->orig_bios, &pkt->orig_bios_tail); + pkt->write_size += bio->bi_size / CD_FRAMESIZE; + spin_unlock(&pkt->lock); + } + spin_unlock(&pd->lock); + + pkt->sleep_time = max(PACKET_WAIT_TIME, 1); + pkt_set_state(pkt, PACKET_WAITING_STATE); + atomic_set(&pkt->run_sm, 1); + + spin_lock(&pd->cdrw.active_list_lock); + list_add(&pkt->list, &pd->cdrw.pkt_active_list); + spin_unlock(&pd->cdrw.active_list_lock); + + return 1; +} + +/* + * Assemble a bio to write one packet and queue the bio for processing + * by the underlying block device. + */ +static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + struct bio *bio; + struct page *pages[PACKET_MAX_SIZE]; + int offsets[PACKET_MAX_SIZE]; + int f; + int frames_write; + + for (f = 0; f < pkt->frames; f++) { + pages[f] = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; + offsets[f] = (f * CD_FRAMESIZE) % PAGE_SIZE; + } + + /* + * Fill-in pages[] and offsets[] with data from orig_bios. + */ + frames_write = 0; + spin_lock(&pkt->lock); + for (bio = pkt->orig_bios; bio; bio = bio->bi_next) { + int segment = bio->bi_idx; + int src_offs = 0; + int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); + int num_frames = bio->bi_size / CD_FRAMESIZE; + BUG_ON(first_frame < 0); + BUG_ON(first_frame + num_frames > pkt->frames); + for (f = first_frame; f < first_frame + num_frames; f++) { + struct bio_vec *src_bvl = bio_iovec_idx(bio, segment); + + while (src_offs >= src_bvl->bv_len) { + src_offs -= src_bvl->bv_len; + segment++; + BUG_ON(segment >= bio->bi_vcnt); + src_bvl = bio_iovec_idx(bio, segment); + } + + if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) { + pages[f] = src_bvl->bv_page; + offsets[f] = src_bvl->bv_offset + src_offs; + } else { + pkt_copy_bio_data(bio, segment, src_offs, + pages[f], offsets[f]); + } + src_offs += CD_FRAMESIZE; + frames_write++; + } + } + pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); + spin_unlock(&pkt->lock); + + VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n", + frames_write, (unsigned long long)pkt->sector); + BUG_ON(frames_write != pkt->write_size); + + if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { + pkt_make_local_copy(pkt, pages, offsets); + pkt->cache_valid = 1; + } else { + pkt->cache_valid = 0; + } + + /* Start the write request */ + bio_init(pkt->w_bio); + pkt->w_bio->bi_max_vecs = PACKET_MAX_SIZE; + pkt->w_bio->bi_sector = pkt->sector; + pkt->w_bio->bi_bdev = pd->bdev; + pkt->w_bio->bi_end_io = pkt_end_io_packet_write; + pkt->w_bio->bi_private = pkt; + for (f = 0; f < pkt->frames; f++) { + if ((f + 1 < pkt->frames) && (pages[f + 1] == pages[f]) && + (offsets[f + 1] == offsets[f] + CD_FRAMESIZE)) { + if (!bio_add_page(pkt->w_bio, pages[f], CD_FRAMESIZE * 2, offsets[f])) + BUG(); + f++; + } else { + if (!bio_add_page(pkt->w_bio, pages[f], CD_FRAMESIZE, offsets[f])) + BUG(); + } + } + VPRINTK("pktcdvd: vcnt=%d\n", pkt->w_bio->bi_vcnt); + + atomic_set(&pkt->io_wait, 1); + pkt->w_bio->bi_rw = WRITE; + pkt_queue_bio(pd, pkt->w_bio, 0); +} + +static void pkt_finish_packet(struct packet_data *pkt, int uptodate) +{ + struct bio *bio,
*next; + + if (!uptodate) + pkt->cache_valid = 0; + + /* Finish all bios corresponding to this packet */ + bio = pkt->orig_bios; + while (bio) { + next = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO); + bio = next; + } + pkt->orig_bios = pkt->orig_bios_tail = NULL; +} + +static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int uptodate; + + VPRINTK("run_state_machine: pkt %d\n", pkt->id); + + for (;;) { + switch (pkt->state) { + case PACKET_WAITING_STATE: + if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0)) + return; + + pkt->sleep_time = 0; + pkt_gather_data(pd, pkt); + pkt_set_state(pkt, PACKET_READ_WAIT_STATE); + break; + + case PACKET_READ_WAIT_STATE: + if (atomic_read(&pkt->io_wait) > 0) + return; + + if (atomic_read(&pkt->io_errors) > 0) { + pkt_set_state(pkt, PACKET_RECOVERY_STATE); + } else { + pkt_start_write(pd, pkt); + } + break; + + case PACKET_WRITE_WAIT_STATE: + if (atomic_read(&pkt->io_wait) > 0) + return; + + if (test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags)) { + pkt_set_state(pkt, PACKET_FINISHED_STATE); + } else { + pkt_set_state(pkt, PACKET_RECOVERY_STATE); + } + break; + + case PACKET_RECOVERY_STATE: + if (pkt_start_recovery(pkt)) { + pkt_start_write(pd, pkt); + } else { + VPRINTK("No recovery possible\n"); + pkt_set_state(pkt, PACKET_FINISHED_STATE); + } + break; + + case PACKET_FINISHED_STATE: + uptodate = test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags); + pkt_finish_packet(pkt, uptodate); + return; + + default: + BUG(); + break; + } + } +} + +static void pkt_handle_packets(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *next; + + VPRINTK("pkt_handle_packets\n"); + + /* + * Run state machine for active packets + */ + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (atomic_read(&pkt->run_sm) > 0) { + atomic_set(&pkt->run_sm, 0); + pkt_run_state_machine(pd, pkt); + } + } + + /* + * Move no longer active packets to the free list + */ + spin_lock(&pd->cdrw.active_list_lock); + list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) { + if (pkt->state == PACKET_FINISHED_STATE) { + list_del(&pkt->list); + pkt_put_packet_data(pd, pkt); + pkt_set_state(pkt, PACKET_IDLE_STATE); + atomic_set(&pd->scan_queue, 1); + } + } + spin_unlock(&pd->cdrw.active_list_lock); +} + +static void pkt_count_states(struct pktcdvd_device *pd, int *states) +{ + struct packet_data *pkt; + int i; + + for (i = 0; i < PACKET_NUM_STATES; i++) + states[i] = 0; + + spin_lock(&pd->cdrw.active_list_lock); + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + states[pkt->state]++; + } + spin_unlock(&pd->cdrw.active_list_lock); +} + +/* + * kcdrwd is woken up when writes have been queued for one of our + * registered devices + */ +static int kcdrwd(void *foobar) +{ + struct pktcdvd_device *pd = foobar; + struct packet_data *pkt; + long min_sleep_time, residue; + + set_user_nice(current, -20); + + for (;;) { + DECLARE_WAITQUEUE(wait, current); + + /* + * Wait until there is something to do + */ + add_wait_queue(&pd->wqueue, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + /* Check if we need to run pkt_handle_queue */ + if (atomic_read(&pd->scan_queue) > 0) + goto work_to_do; + + /* Check if we need to run the state machine for some packet */ + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (atomic_read(&pkt->run_sm) > 0) + goto work_to_do; + } + + /* Check if we need to process the iosched queues */ + if
(atomic_read(&pd->iosched.attention) != 0) + goto work_to_do; + + /* Otherwise, go to sleep */ + if (PACKET_DEBUG > 1) { + int states[PACKET_NUM_STATES]; + pkt_count_states(pd, states); + VPRINTK("kcdrwd: i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], + states[4], states[5]); + } + + min_sleep_time = MAX_SCHEDULE_TIMEOUT; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sleep_time && pkt->sleep_time < min_sleep_time) + min_sleep_time = pkt->sleep_time; + } + + generic_unplug_device(bdev_get_queue(pd->bdev)); + + VPRINTK("kcdrwd: sleeping\n"); + residue = schedule_timeout(min_sleep_time); + VPRINTK("kcdrwd: wake up\n"); + + /* make swsusp happy with our thread */ + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (!pkt->sleep_time) + continue; + pkt->sleep_time -= min_sleep_time - residue; + if (pkt->sleep_time <= 0) { + pkt->sleep_time = 0; + atomic_inc(&pkt->run_sm); + } + } + + if (signal_pending(current)) { + flush_signals(current); + } + if (kthread_should_stop()) + break; + } +work_to_do: + set_current_state(TASK_RUNNING); + remove_wait_queue(&pd->wqueue, &wait); + + if (kthread_should_stop()) + break; + + /* + * if pkt_handle_queue returns true, we can queue + * another request. + */ + while (pkt_handle_queue(pd)) + ; + + /* + * Handle packet state machine + */ + pkt_handle_packets(pd); + + /* + * Handle iosched queues + */ + pkt_iosched_process_queue(pd); + } + + return 0; +} + +static void pkt_print_settings(struct pktcdvd_device *pd) +{ + printk("pktcdvd: %s packets, ", pd->settings.fp ? "Fixed" : "Variable"); + printk("%u blocks, ", pd->settings.size >> 2); + printk("Mode-%c disc\n", pd->settings.block_mode == 8 ? 
'1' : '2'); +} + +static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, + int page_code, int page_control) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + + cgc->cmd[0] = GPCMD_MODE_SENSE_10; + cgc->cmd[2] = page_code | (page_control << 6); + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_READ; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + memset(cgc->buffer, 0, 2); + cgc->cmd[0] = GPCMD_MODE_SELECT_10; + cgc->cmd[1] = 0x10; /* PF */ + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_WRITE; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) +{ + struct packet_command cgc; + int ret; + + /* set up command and get the disc info */ + init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_DISC_INFO; + cgc.cmd[8] = cgc.buflen = 2; + cgc.quiet = 1; + + if ((ret = pkt_generic_packet(pd, &cgc))) + return ret; + + /* not all drives have the same disc_info length, so requeue + * packet with the length the drive tells us it can supply + */ + cgc.buflen = be16_to_cpu(di->disc_information_length) + + sizeof(di->disc_information_length); + + if (cgc.buflen > sizeof(disc_information)) + cgc.buflen = sizeof(disc_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti) +{ + struct packet_command cgc; + int ret; + + init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; + cgc.cmd[1] = type & 3; + cgc.cmd[4] = (track & 0xff00) >> 8; + cgc.cmd[5] = track & 0xff; + cgc.cmd[8] = 8; + cgc.quiet = 1; + + if ((ret = pkt_generic_packet(pd, &cgc))) + return ret; + + cgc.buflen = be16_to_cpu(ti->track_information_length) + + sizeof(ti->track_information_length); + + if (cgc.buflen > sizeof(track_information)) + cgc.buflen = sizeof(track_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static int pkt_get_last_written(struct pktcdvd_device *pd, long *last_written) +{ + disc_information di; + track_information ti; + __u32 last_track; + int ret = -1; + + if ((ret = pkt_get_disc_info(pd, &di))) + return ret; + + last_track = (di.last_track_msb << 8) | di.last_track_lsb; + if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + return ret; + + /* if this track is blank, try the previous. */ + if (ti.blank) { + last_track--; + if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + return ret; + } + + /* if last recorded field is valid, return it. 
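+ * Otherwise approximate it as track start plus track size, minus + * (free blocks + 7) when the track is not yet full.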
*/ + if (ti.lra_v) { + *last_written = be32_to_cpu(ti.last_rec_address); + } else { + /* make it up instead */ + *last_written = be32_to_cpu(ti.track_start) + + be32_to_cpu(ti.track_size); + if (ti.free_blocks) + *last_written -= (be32_to_cpu(ti.free_blocks) + 7); + } + return 0; +} + +/* + * write mode select package based on pd->settings + */ +static int pkt_set_write_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct request_sense sense; + write_param_page *wp; + char buffer[128]; + int ret, size; + + /* doesn't apply to DVD+RW */ + if (pd->mmc3_profile == 0x1a) + return 0; + + memset(buffer, 0, sizeof(buffer)); + init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); + cgc.sense = &sense; + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + pkt_dump_sense(&cgc); + return ret; + } + + size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff)); + pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff); + if (size > sizeof(buffer)) + size = sizeof(buffer); + + /* + * now get it all + */ + init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); + cgc.sense = &sense; + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + pkt_dump_sense(&cgc); + return ret; + } + + /* + * write page is offset header + block descriptor length + */ + wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset]; + + wp->fp = pd->settings.fp; + wp->track_mode = pd->settings.track_mode; + wp->write_type = pd->settings.write_type; + wp->data_block_type = pd->settings.block_mode; + + wp->multi_session = 0; + +#ifdef PACKET_USE_LS + wp->link_size = 7; + wp->ls_v = 1; +#endif + + if (wp->data_block_type == PACKET_BLOCK_MODE1) { + wp->session_format = 0; + wp->subhdr2 = 0x20; + } else if (wp->data_block_type == PACKET_BLOCK_MODE2) { + wp->session_format = 0x20; + wp->subhdr2 = 8; +#if 0 + wp->mcn[0] = 0x80; + memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1); +#endif + } else { + /* + * paranoia + */ + printk("pktcdvd: write mode wrong %d\n", wp->data_block_type); + return 1; + } + wp->packet_size = cpu_to_be32(pd->settings.size >> 2); + + cgc.buflen = cgc.cmd[8] = size; + if ((ret = pkt_mode_select(pd, &cgc))) { + pkt_dump_sense(&cgc); + return ret; + } + + pkt_print_settings(pd); + return 0; +} + +/* + * 0 -- we can write to this track, 1 -- we can't + */ +static int pkt_good_track(track_information *ti) +{ + /* + * only good for CD-RW at the moment, not DVD-RW + */ + + /* + * FIXME: only for FP + */ + if (ti->fp == 0) + return 0; + + /* + * "good" settings as per Mt Fuji. + */ + if (ti->rt == 0 && ti->blank == 0 && ti->packet == 1) + return 0; + + if (ti->rt == 0 && ti->blank == 1 && ti->packet == 1) + return 0; + + if (ti->rt == 1 && ti->blank == 0 && ti->packet == 1) + return 0; + + printk("pktcdvd: bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + return 1; +} + +/* + * 0 -- we can write to this disc, 1 -- we can't + */ +static int pkt_good_disc(struct pktcdvd_device *pd, disc_information *di) +{ + switch (pd->mmc3_profile) { + case 0x0a: /* CD-RW */ + case 0xffff: /* MMC3 not supported */ + break; + case 0x1a: /* DVD+RW */ + case 0x13: /* DVD-RW */ + return 0; + default: + printk("pktcdvd: Wrong disc profile (%x)\n", pd->mmc3_profile); + return 1; + } + + /* + * for disc type 0xff we should probably reserve a new track. + * but i'm not sure, should we leave this to user apps? probably. + */ + if (di->disc_type == 0xff) { + printk("pktcdvd: Unknown disc. 
No track?\n"); + return 1; + } + + if (di->disc_type != 0x20 && di->disc_type != 0) { + printk("pktcdvd: Wrong disc type (%x)\n", di->disc_type); + return 1; + } + + if (di->erasable == 0) { + printk("pktcdvd: Disc not erasable\n"); + return 1; + } + + if (di->border_status == PACKET_SESSION_RESERVED) { + printk("pktcdvd: Can't write to last track (reserved)\n"); + return 1; + } + + return 0; +} + +static int pkt_probe_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + unsigned char buf[12]; + disc_information di; + track_information ti; + int ret, track; + + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_GET_CONFIGURATION; + cgc.cmd[8] = 8; + ret = pkt_generic_packet(pd, &cgc); + pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7]; + + memset(&di, 0, sizeof(disc_information)); + memset(&ti, 0, sizeof(track_information)); + + if ((ret = pkt_get_disc_info(pd, &di))) { + printk("failed get_disc\n"); + return ret; + } + + if (pkt_good_disc(pd, &di)) + return -ENXIO; + + switch (pd->mmc3_profile) { + case 0x1a: /* DVD+RW */ + printk("pktcdvd: inserted media is DVD+RW\n"); + break; + case 0x13: /* DVD-RW */ + printk("pktcdvd: inserted media is DVD-RW\n"); + break; + default: + printk("pktcdvd: inserted media is CD-R%s\n", di.erasable ? "W" : ""); + break; + } + pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; + + track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ + if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { + printk("pktcdvd: failed get_track\n"); + return ret; + } + + if (pkt_good_track(&ti)) { + printk("pktcdvd: can't write to this track\n"); + return -ENXIO; + } + + /* + * we keep packet size in 512 byte units, makes it easier to + * deal with request calculations. + */ + pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; + if (pd->settings.size == 0) { + printk("pktcdvd: detected zero packet size!\n"); + pd->settings.size = 128; + } + pd->settings.fp = ti.fp; + pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1); + + if (ti.nwa_v) { + pd->nwa = be32_to_cpu(ti.next_writable); + set_bit(PACKET_NWA_VALID, &pd->flags); + } + + /* + * in theory we could use lra on -RW media as well and just zero + * blocks that haven't been written yet, but in practice that + * is just a no-go. we'll use that for -R, naturally. 
+ */ + if (ti.lra_v) { + pd->lra = be32_to_cpu(ti.last_rec_address); + set_bit(PACKET_LRA_VALID, &pd->flags); + } else { + pd->lra = 0xffffffff; + set_bit(PACKET_LRA_VALID, &pd->flags); + } + + /* + * fine for now + */ + pd->settings.link_loss = 7; + pd->settings.write_type = 0; /* packet */ + pd->settings.track_mode = ti.track_mode; + + /* + * mode1 or mode2 disc + */ + switch (ti.data_mode) { + case PACKET_MODE1: + pd->settings.block_mode = PACKET_BLOCK_MODE1; + break; + case PACKET_MODE2: + pd->settings.block_mode = PACKET_BLOCK_MODE2; + break; + default: + printk("pktcdvd: unknown data mode\n"); + return 1; + } + return 0; +} + +/* + * enable/disable write caching on drive + */ +static int pkt_write_caching(struct pktcdvd_device *pd, int set) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[64]; + int ret; + + memset(buf, 0, sizeof(buf)); + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.sense = &sense; + cgc.buflen = pd->mode_offset + 12; + + /* + * caching mode page might not be there, so quiet this command + */ + cgc.quiet = 1; + + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0))) + return ret; + + buf[pd->mode_offset + 10] |= (!!set << 2); + + cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); + ret = pkt_mode_select(pd, &cgc); + if (ret) { + printk("pktcdvd: write caching control failed\n"); + pkt_dump_sense(&cgc); + } else if (!ret && set) + printk("pktcdvd: enabled write caching on %s\n", pd->name); + return ret; +} + +static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL; + cgc.cmd[4] = lockflag ? 1 : 0; + return pkt_generic_packet(pd, &cgc); +} + +/* + * Returns drive maximum write speed + */ +static int pkt_get_max_speed(struct pktcdvd_device *pd, unsigned *write_speed) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[256+18]; + unsigned char *cap_buf; + int ret, offset; + + memset(buf, 0, sizeof(buf)); + cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); + cgc.sense = &sense; + + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + cgc.buflen = pd->mode_offset + cap_buf[1] + 2 + + sizeof(struct mode_page_header); + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + pkt_dump_sense(&cgc); + return ret; + } + } + + offset = 20; /* Obsoleted field, used by older drives */ + if (cap_buf[1] >= 28) + offset = 28; /* Current write speed selected */ + if (cap_buf[1] >= 30) { + /* If the drive reports at least one "Logical Unit Write + * Speed Performance Descriptor Block", use the information + * in the first block. 
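The descriptor count sits at bytes 30-31 of the page, so a non-zero count moves the speed offset from 28 to 34, i.e. into the first descriptor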
(contains the highest speed) + */ + int num_spdb = (cap_buf[30] << 8) + cap_buf[31]; + if (num_spdb > 0) + offset = 34; + } + + *write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1]; + return 0; +} + +/* These tables from cdrecord - I don't have orange book */ +/* standard speed CD-RW (1-4x) */ +static char clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* high speed CD-RW (-10x) */ +static char hs_clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* ultra high speed CD-RW */ +static char us_clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0 +}; + +/* + * reads the maximum media speed from ATIP + */ +static int pkt_media_speed(struct pktcdvd_device *pd, unsigned *speed) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[64]; + unsigned int size, st, sp; + int ret; + + init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); + cgc.sense = &sense; + cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; + cgc.cmd[1] = 2; + cgc.cmd[2] = 4; /* READ ATIP */ + cgc.cmd[8] = 2; + ret = pkt_generic_packet(pd, &cgc); + if (ret) { + pkt_dump_sense(&cgc); + return ret; + } + size = ((unsigned int) buf[0]<<8) + buf[1] + 2; + if (size > sizeof(buf)) + size = sizeof(buf); + + init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); + cgc.sense = &sense; + cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; + cgc.cmd[1] = 2; + cgc.cmd[2] = 4; + cgc.cmd[8] = size; + ret = pkt_generic_packet(pd, &cgc); + if (ret) { + pkt_dump_sense(&cgc); + return ret; + } + + if (!(buf[6] & 0x40)) { + printk("pktcdvd: Disc type is not CD-RW\n"); + return 1; + } + if (!(buf[6] & 0x4)) { + printk("pktcdvd: A1 values on media are not valid, maybe not CDRW?\n"); + return 1; + } + + st = (buf[6] >> 3) & 0x7; /* disc sub-type */ + + sp = buf[16] & 0xf; /* max speed from ATIP A1 field */ + + /* Info from cdrecord */ + switch (st) { + case 0: /* standard speed */ + *speed = clv_to_speed[sp]; + break; + case 1: /* high speed */ + *speed = hs_clv_to_speed[sp]; + break; + case 2: /* ultra high speed */ + *speed = us_clv_to_speed[sp]; + break; + default: + printk("pktcdvd: Unknown disc sub-type %d\n",st); + return 1; + } + if (*speed) { + printk("pktcdvd: Max. 
media speed: %d\n",*speed); + return 0; + } else { + printk("pktcdvd: Unknown speed %d for sub-type %d\n",sp,st); + return 1; + } +} + +static int pkt_perform_opc(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct request_sense sense; + int ret; + + VPRINTK("pktcdvd: Performing OPC\n"); + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sense = &sense; + cgc.timeout = 60*HZ; + cgc.cmd[0] = GPCMD_SEND_OPC; + cgc.cmd[1] = 1; + if ((ret = pkt_generic_packet(pd, &cgc))) + pkt_dump_sense(&cgc); + return ret; +} + +static int pkt_open_write(struct pktcdvd_device *pd) +{ + int ret; + unsigned int write_speed, media_write_speed, read_speed; + + if ((ret = pkt_probe_settings(pd))) { + DPRINTK("pktcdvd: %s failed probe\n", pd->name); + return -EIO; + } + + if ((ret = pkt_set_write_settings(pd))) { + DPRINTK("pktcdvd: %s failed saving write settings\n", pd->name); + return -EIO; + } + + pkt_write_caching(pd, USE_WCACHING); + + if ((ret = pkt_get_max_speed(pd, &write_speed))) + write_speed = 16 * 177; + switch (pd->mmc3_profile) { + case 0x13: /* DVD-RW */ + case 0x1a: /* DVD+RW */ + DPRINTK("pktcdvd: write speed %ukB/s\n", write_speed); + break; + default: + if ((ret = pkt_media_speed(pd, &media_write_speed))) + media_write_speed = 16; + write_speed = min(write_speed, media_write_speed * 177); + DPRINTK("pktcdvd: write speed %ux\n", write_speed / 176); + break; + } + read_speed = write_speed; + + if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { + DPRINTK("pktcdvd: %s couldn't set write speed\n", pd->name); + return -EIO; + } + pd->write_speed = write_speed; + pd->read_speed = read_speed; + + if ((ret = pkt_perform_opc(pd))) { + DPRINTK("pktcdvd: %s Optimum Power Calibration failed\n", pd->name); + } + + return 0; +} + +/* + * called at open time. + */ +static int pkt_open_dev(struct pktcdvd_device *pd, int write) +{ + int ret; + long lba; + request_queue_t *q; + + /* + * We need to re-open the cdrom device without O_NONBLOCK to be able + * to read/write from/to it. It is already opened in O_NONBLOCK mode + * so bdget() can't fail. + */ + bdget(pd->bdev->bd_dev); + if ((ret = blkdev_get(pd->bdev, FMODE_READ, O_RDONLY))) + goto out; + + if ((ret = pkt_get_last_written(pd, &lba))) { + printk("pktcdvd: pkt_get_last_written failed\n"); + goto out_putdev; + } + + set_capacity(pd->disk, lba << 2); + set_capacity(pd->bdev->bd_disk, lba << 2); + bd_set_size(pd->bdev, (loff_t)lba << 11); + + q = bdev_get_queue(pd->bdev); + if (write) { + if ((ret = pkt_open_write(pd))) + goto out_putdev; + /* + * Some CDRW drives can not handle writes larger than one packet, + * even if the size is a multiple of the packet size. + */ + spin_lock_irq(q->queue_lock); + blk_queue_max_sectors(q, pd->settings.size); + spin_unlock_irq(q->queue_lock); + set_bit(PACKET_WRITABLE, &pd->flags); + } else { + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + clear_bit(PACKET_WRITABLE, &pd->flags); + } + + if ((ret = pkt_set_segment_merging(pd, q))) + goto out_putdev; + + if (write) + printk("pktcdvd: %lukB available on disc\n", lba << 1); + + return 0; + +out_putdev: + blkdev_put(pd->bdev); +out: + return ret; +} + +/* + * called when the device is closed. makes sure that the device flushes + * the internal cache before we close. 
+ */ +static void pkt_release_dev(struct pktcdvd_device *pd, int flush) +{ + if (flush && pkt_flush_cache(pd)) + DPRINTK("pktcdvd: %s not flushing cache\n", pd->name); + + pkt_lock_door(pd, 0); + + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + blkdev_put(pd->bdev); +} + +static struct pktcdvd_device *pkt_find_dev_from_minor(int dev_minor) +{ + if (dev_minor >= MAX_WRITERS) + return NULL; + return pkt_devs[dev_minor]; +} + +static int pkt_open(struct inode *inode, struct file *file) +{ + struct pktcdvd_device *pd = NULL; + int ret; + + VPRINTK("pktcdvd: entering open\n"); + + down(&ctl_mutex); + pd = pkt_find_dev_from_minor(iminor(inode)); + if (!pd) { + ret = -ENODEV; + goto out; + } + BUG_ON(pd->refcnt < 0); + + pd->refcnt++; + if (pd->refcnt == 1) { + if (pkt_open_dev(pd, file->f_mode & FMODE_WRITE)) { + ret = -EIO; + goto out_dec; + } + /* + * needed here as well, since ext2 (among others) may change + * the blocksize at mount time + */ + set_blocksize(inode->i_bdev, CD_FRAMESIZE); + } + + up(&ctl_mutex); + return 0; + +out_dec: + pd->refcnt--; +out: + VPRINTK("pktcdvd: failed open (%d)\n", ret); + up(&ctl_mutex); + return ret; +} + +static int pkt_close(struct inode *inode, struct file *file) +{ + struct pktcdvd_device *pd = inode->i_bdev->bd_disk->private_data; + int ret = 0; + + down(&ctl_mutex); + pd->refcnt--; + BUG_ON(pd->refcnt < 0); + if (pd->refcnt == 0) { + int flush = test_bit(PACKET_WRITABLE, &pd->flags); + pkt_release_dev(pd, flush); + } + up(&ctl_mutex); + return ret; +} + + +static void *psd_pool_alloc(int gfp_mask, void *data) +{ + return kmalloc(sizeof(struct packet_stacked_data), gfp_mask); +} + +static void psd_pool_free(void *ptr, void *data) +{ + kfree(ptr); +} + +static int pkt_end_io_read_cloned(struct bio *bio, unsigned int bytes_done, int err) +{ + struct packet_stacked_data *psd = bio->bi_private; + struct pktcdvd_device *pd = psd->pd; + + if (bio->bi_size) + return 1; + + bio_put(bio); + bio_endio(psd->bio, psd->bio->bi_size, err); + mempool_free(psd, psd_pool); + pkt_bio_finished(pd); + return 0; +} + +static int pkt_make_request(request_queue_t *q, struct bio *bio) +{ + struct pktcdvd_device *pd; + char b[BDEVNAME_SIZE]; + sector_t zone; + struct packet_data *pkt; + int was_empty, blocked_bio; + struct pkt_rb_node *node; + + pd = q->queuedata; + if (!pd) { + printk("pktcdvd: %s incorrect request queue\n", bdevname(bio->bi_bdev, b)); + goto end_io; + } + + /* + * Clone READ bios so we can have our own bi_end_io callback. 
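+ * The clone is redirected at the underlying cdrom device and carries a packet_stacked_data, so pkt_end_io_read_cloned() can complete the original bio and drop the per-device bio count via pkt_bio_finished().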
+ */ + if (bio_data_dir(bio) == READ) { + struct bio *cloned_bio = bio_clone(bio, GFP_NOIO); + struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); + + psd->pd = pd; + psd->bio = bio; + cloned_bio->bi_bdev = pd->bdev; + cloned_bio->bi_private = psd; + cloned_bio->bi_end_io = pkt_end_io_read_cloned; + pd->stats.secs_r += bio->bi_size >> 9; + pkt_queue_bio(pd, cloned_bio, 1); + return 0; + } + + if (!test_bit(PACKET_WRITABLE, &pd->flags)) { + printk("pktcdvd: WRITE for ro device %s (%llu)\n", + pd->name, (unsigned long long)bio->bi_sector); + goto end_io; + } + + if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { + printk("pktcdvd: wrong bio size\n"); + goto end_io; + } + + blk_queue_bounce(q, &bio); + + zone = ZONE(bio->bi_sector, pd); + VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", + (unsigned long long)bio->bi_sector, + (unsigned long long)(bio->bi_sector + bio_sectors(bio))); + + /* Check if we have to split the bio */ + { + struct bio_pair *bp; + sector_t last_zone; + int first_sectors; + + last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd); + if (last_zone != zone) { + BUG_ON(last_zone != zone + pd->settings.size); + first_sectors = last_zone - bio->bi_sector; + bp = bio_split(bio, bio_split_pool, first_sectors); + BUG_ON(!bp); + pkt_make_request(q, &bp->bio1); + pkt_make_request(q, &bp->bio2); + bio_pair_release(bp); + return 0; + } + } + + /* + * If we find a matching packet in state WAITING or READ_WAIT, we can + * just append this bio to that packet. + */ + spin_lock(&pd->cdrw.active_list_lock); + blocked_bio = 0; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sector == zone) { + spin_lock(&pkt->lock); + if ((pkt->state == PACKET_WAITING_STATE) || + (pkt->state == PACKET_READ_WAIT_STATE)) { + pkt_add_list_last(bio, &pkt->orig_bios, + &pkt->orig_bios_tail); + pkt->write_size += bio->bi_size / CD_FRAMESIZE; + if ((pkt->write_size >= pkt->frames) && + (pkt->state == PACKET_WAITING_STATE)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + spin_unlock(&pkt->lock); + spin_unlock(&pd->cdrw.active_list_lock); + return 0; + } else { + blocked_bio = 1; + } + spin_unlock(&pkt->lock); + } + } + spin_unlock(&pd->cdrw.active_list_lock); + + /* + * No matching packet found. Store the bio in the work queue. + */ + node = mempool_alloc(pd->rb_pool, GFP_NOIO); + BUG_ON(!node); + node->bio = bio; + spin_lock(&pd->lock); + BUG_ON(pd->bio_queue_size < 0); + was_empty = (pd->bio_queue_size == 0); + pkt_rbtree_insert(pd, node); + spin_unlock(&pd->lock); + + /* + * Wake up the worker thread. + */ + atomic_set(&pd->scan_queue, 1); + if (was_empty) { + /* This wake_up is required for correct operation */ + wake_up(&pd->wqueue); + } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) { + /* + * This wake up is not required for correct operation, + * but improves performance in some cases. + */ + wake_up(&pd->wqueue); + } + return 0; +end_io: + bio_io_error(bio, bio->bi_size); + return 0; +} + + + +static int pkt_merge_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *bvec) +{ + struct pktcdvd_device *pd = q->queuedata; + sector_t zone = ZONE(bio->bi_sector, pd); + int used = ((bio->bi_sector - zone) << 9) + bio->bi_size; + int remaining = (pd->settings.size << 9) - used; + int remaining2; + + /* + * A bio <= PAGE_SIZE must be allowed. If it crosses a packet + * boundary, pkt_make_request() will split the bio. 
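+ * So never report less room than PAGE_SIZE - bi_size; the max() below enforces exactly that.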
+ */ + remaining2 = PAGE_SIZE - bio->bi_size; + remaining = max(remaining, remaining2); + + BUG_ON(remaining < 0); + return remaining; +} + +static void pkt_init_queue(struct pktcdvd_device *pd) +{ + request_queue_t *q = pd->disk->queue; + + blk_queue_make_request(q, pkt_make_request); + blk_queue_hardsect_size(q, CD_FRAMESIZE); + blk_queue_max_sectors(q, PACKET_MAX_SECTORS); + blk_queue_merge_bvec(q, pkt_merge_bvec); + q->queuedata = pd; +} + +static int pkt_seq_show(struct seq_file *m, void *p) +{ + struct pktcdvd_device *pd = m->private; + char *msg; + char bdev_buf[BDEVNAME_SIZE]; + int states[PACKET_NUM_STATES]; + + seq_printf(m, "Writer %s mapped to %s:\n", pd->name, + bdevname(pd->bdev, bdev_buf)); + + seq_printf(m, "\nSettings:\n"); + seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); + + if (pd->settings.write_type == 0) + msg = "Packet"; + else + msg = "Unknown"; + seq_printf(m, "\twrite type:\t\t%s\n", msg); + + seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); + seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); + + seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); + + if (pd->settings.block_mode == PACKET_BLOCK_MODE1) + msg = "Mode 1"; + else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) + msg = "Mode 2"; + else + msg = "Unknown"; + seq_printf(m, "\tblock mode:\t\t%s\n", msg); + + seq_printf(m, "\nStatistics:\n"); + seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); + seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); + seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); + seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); + seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); + + seq_printf(m, "\nMisc:\n"); + seq_printf(m, "\treference count:\t%d\n", pd->refcnt); + seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); + seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); + seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); + seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); + seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); + + seq_printf(m, "\nQueue state:\n"); + seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); + seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); + seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); + + pkt_count_states(pd, states); + seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], states[4], states[5]); + + return 0; +} + +static int pkt_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, pkt_seq_show, PDE(inode)->data); +} + +static struct file_operations pkt_proc_fops = { + .open = pkt_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) +{ + int i; + int ret = 0; + char b[BDEVNAME_SIZE]; + struct proc_dir_entry *proc; + struct block_device *bdev; + + if (pd->pkt_dev == dev) { + printk("pktcdvd: Recursive setup not allowed\n"); + return -EBUSY; + } + for (i = 0; i < MAX_WRITERS; i++) { + struct pktcdvd_device *pd2 = pkt_devs[i]; + if (!pd2) + continue; + if (pd2->bdev->bd_dev == dev) { + printk("pktcdvd: %s already setup\n", bdevname(pd2->bdev, b)); + return -EBUSY; + } + if (pd2->pkt_dev == dev) { + printk("pktcdvd: Can't chain pktcdvd devices\n"); + return -EBUSY; + } + } + + bdev = bdget(dev); + if (!bdev) + return 
-ENOMEM; + ret = blkdev_get(bdev, FMODE_READ, O_RDONLY | O_NONBLOCK); + if (ret) + return ret; + + /* This is safe, since we have a reference from open(). */ + __module_get(THIS_MODULE); + + if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { + printk("pktcdvd: not enough memory for buffers\n"); + ret = -ENOMEM; + goto out_mem; + } + + pd->bdev = bdev; + set_blocksize(bdev, CD_FRAMESIZE); + + pkt_init_queue(pd); + + atomic_set(&pd->cdrw.pending_bios, 0); + pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); + if (IS_ERR(pd->cdrw.thread)) { + printk("pktcdvd: can't start kernel thread\n"); + ret = -ENOMEM; + goto out_thread; + } + + proc = create_proc_entry(pd->name, 0, pkt_proc); + if (proc) { + proc->data = pd; + proc->proc_fops = &pkt_proc_fops; + } + DPRINTK("pktcdvd: writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); + return 0; + +out_thread: + pkt_shrink_pktlist(pd); +out_mem: + blkdev_put(bdev); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return ret; +} + +static int pkt_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pktcdvd_device *pd = inode->i_bdev->bd_disk->private_data; + + VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, imajor(inode), iminor(inode)); + BUG_ON(!pd); + + switch (cmd) { + /* + * forward selected CDROM ioctls to CD-ROM, for UDF + */ + case CDROMMULTISESSION: + case CDROMREADTOCENTRY: + case CDROM_LAST_WRITTEN: + case CDROM_SEND_PACKET: + case SCSI_IOCTL_SEND_COMMAND: + return ioctl_by_bdev(pd->bdev, cmd, arg); + + case CDROMEJECT: + /* + * The door gets locked when the device is opened, so we + * have to unlock it or else the eject command fails. + */ + pkt_lock_door(pd, 0); + return ioctl_by_bdev(pd->bdev, cmd, arg); + + default: + printk("pktcdvd: Unknown ioctl for %s (%x)\n", pd->name, cmd); + return -ENOTTY; + } + + return 0; +} + +static int pkt_media_changed(struct gendisk *disk) +{ + struct pktcdvd_device *pd = disk->private_data; + struct gendisk *attached_disk; + + if (!pd) + return 0; + if (!pd->bdev) + return 0; + attached_disk = pd->bdev->bd_disk; + if (!attached_disk) + return 0; + return attached_disk->fops->media_changed(attached_disk); +} + +static struct block_device_operations pktcdvd_ops = { + .owner = THIS_MODULE, + .open = pkt_open, + .release = pkt_close, + .ioctl = pkt_ioctl, + .media_changed = pkt_media_changed, +}; + +/* + * Set up mapping from pktcdvd device to CD-ROM device. 
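+ * Finds a free writer slot, allocates the pktcdvd_device together with its gendisk and request queue, and hands the actual attach to pkt_new_dev(); the error labels below unwind in reverse order.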
+ */ +static int pkt_setup_dev(struct pkt_ctrl_command *ctrl_cmd) +{ + int idx; + int ret = -ENOMEM; + struct pktcdvd_device *pd; + struct gendisk *disk; + dev_t dev = new_decode_dev(ctrl_cmd->dev); + + for (idx = 0; idx < MAX_WRITERS; idx++) + if (!pkt_devs[idx]) + break; + if (idx == MAX_WRITERS) { + printk("pktcdvd: max %d writers supported\n", MAX_WRITERS); + return -EBUSY; + } + + pd = kmalloc(sizeof(struct pktcdvd_device), GFP_KERNEL); + if (!pd) + return ret; + memset(pd, 0, sizeof(struct pktcdvd_device)); + + pd->rb_pool = mempool_create(PKT_RB_POOL_SIZE, pkt_rb_alloc, pkt_rb_free, NULL); + if (!pd->rb_pool) + goto out_mem; + + disk = alloc_disk(1); + if (!disk) + goto out_mem; + pd->disk = disk; + + spin_lock_init(&pd->lock); + spin_lock_init(&pd->iosched.lock); + sprintf(pd->name, "pktcdvd%d", idx); + init_waitqueue_head(&pd->wqueue); + pd->bio_queue = RB_ROOT; + + disk->major = pkt_major; + disk->first_minor = idx; + disk->fops = &pktcdvd_ops; + disk->flags = GENHD_FL_REMOVABLE; + sprintf(disk->disk_name, "pktcdvd%d", idx); + disk->private_data = pd; + disk->queue = blk_alloc_queue(GFP_KERNEL); + if (!disk->queue) + goto out_mem2; + + pd->pkt_dev = MKDEV(disk->major, disk->first_minor); + ret = pkt_new_dev(pd, dev); + if (ret) + goto out_new_dev; + + add_disk(disk); + pkt_devs[idx] = pd; + ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); + return 0; + +out_new_dev: + blk_put_queue(disk->queue); +out_mem2: + put_disk(disk); +out_mem: + if (pd->rb_pool) + mempool_destroy(pd->rb_pool); + kfree(pd); + return ret; +} + +/* + * Tear down mapping from pktcdvd device to CD-ROM device. + */ +static int pkt_remove_dev(struct pkt_ctrl_command *ctrl_cmd) +{ + struct pktcdvd_device *pd; + int idx; + dev_t pkt_dev = new_decode_dev(ctrl_cmd->pkt_dev); + + for (idx = 0; idx < MAX_WRITERS; idx++) { + pd = pkt_devs[idx]; + if (pd && (pd->pkt_dev == pkt_dev)) + break; + } + if (idx == MAX_WRITERS) { + DPRINTK("pktcdvd: dev not setup\n"); + return -ENXIO; + } + + if (pd->refcnt > 0) + return -EBUSY; + + if (!IS_ERR(pd->cdrw.thread)) + kthread_stop(pd->cdrw.thread); + + blkdev_put(pd->bdev); + + pkt_shrink_pktlist(pd); + + remove_proc_entry(pd->name, pkt_proc); + DPRINTK("pktcdvd: writer %s unmapped\n", pd->name); + + del_gendisk(pd->disk); + blk_put_queue(pd->disk->queue); + put_disk(pd->disk); + + pkt_devs[idx] = NULL; + mempool_destroy(pd->rb_pool); + kfree(pd); + + /* This is safe: open() is still holding a reference. 
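The reference is on the module, taken when the control device was opened, so THIS_MODULE cannot go away in the middle of this ioctl.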
*/ + module_put(THIS_MODULE); + return 0; +} + +static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) +{ + struct pktcdvd_device *pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); + if (pd) { + ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev); + ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); + } else { + ctrl_cmd->dev = 0; + ctrl_cmd->pkt_dev = 0; + } + ctrl_cmd->num_devices = MAX_WRITERS; +} + +static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct pkt_ctrl_command ctrl_cmd; + int ret = 0; + + if (cmd != PACKET_CTRL_CMD) + return -ENOTTY; + + if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + + switch (ctrl_cmd.command) { + case PKT_CTRL_CMD_SETUP: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + down(&ctl_mutex); + ret = pkt_setup_dev(&ctrl_cmd); + up(&ctl_mutex); + break; + case PKT_CTRL_CMD_TEARDOWN: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + down(&ctl_mutex); + ret = pkt_remove_dev(&ctrl_cmd); + up(&ctl_mutex); + break; + case PKT_CTRL_CMD_STATUS: + down(&ctl_mutex); + pkt_get_status(&ctrl_cmd); + up(&ctl_mutex); + break; + default: + return -ENOTTY; + } + + if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + return ret; +} + + +static struct file_operations pkt_ctl_fops = { + .ioctl = pkt_ctl_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice pkt_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pktcdvd", + .devfs_name = "pktcdvd/control", + .fops = &pkt_ctl_fops +}; + +int pkt_init(void) +{ + int ret; + + psd_pool = mempool_create(PSD_POOL_SIZE, psd_pool_alloc, psd_pool_free, NULL); + if (!psd_pool) + return -ENOMEM; + + ret = register_blkdev(pkt_major, "pktcdvd"); + if (ret < 0) { + printk("pktcdvd: Unable to register block device\n"); + goto out2; + } + if (!pkt_major) + pkt_major = ret; + + ret = misc_register(&pkt_misc); + if (ret) { + printk("pktcdvd: Unable to register misc device\n"); + goto out; + } + + init_MUTEX(&ctl_mutex); + + pkt_proc = proc_mkdir("pktcdvd", proc_root_driver); + + DPRINTK("pktcdvd: %s\n", VERSION_CODE); + return 0; + +out: + unregister_blkdev(pkt_major, "pktcdvd"); +out2: + mempool_destroy(psd_pool); + return ret; +} + +void pkt_exit(void) +{ + remove_proc_entry("pktcdvd", proc_root_driver); + misc_deregister(&pkt_misc); + unregister_blkdev(pkt_major, "pktcdvd"); + mempool_destroy(psd_pool); +} + +MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); +MODULE_AUTHOR("Jens Axboe "); +MODULE_LICENSE("GPL"); + +module_init(pkt_init); +module_exit(pkt_exit); diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile index 5c484f3b3e58..4a8351753e07 100644 --- a/drivers/cdrom/Makefile +++ b/drivers/cdrom/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o obj-$(CONFIG_BLK_DEV_SR) += cdrom.o obj-$(CONFIG_PARIDE_PCD) += cdrom.o +obj-$(CONFIG_CDROM_PKTCDVD) += cdrom.o obj-$(CONFIG_AZTCD) += aztcd.o obj-$(CONFIG_CDU31A) += cdu31a.o cdrom.o diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 468eec62246f..895f245fcced 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -2001,7 +2001,7 @@ ide_do_rw_cdrom (ide_drive_t *drive, struct request *rq, sector_t block) } CDROM_CONFIG_FLAGS(drive)->seeking = 0; } - if (IDE_LARGE_SEEK(info->last_block, block, IDECD_SEEK_THRESHOLD) && drive->dsc_overlap) { + if ((rq_data_dir(rq) == READ) && IDE_LARGE_SEEK(info->last_block, block, IDECD_SEEK_THRESHOLD) && drive->dsc_overlap) { 
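/* overlapped seeks are now attempted for reads only, per the rq_data_dir() check above */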
action = cdrom_start_seek(drive, block); } else { if (rq_data_dir(rq) == READ) @@ -2962,8 +2962,10 @@ int ide_cdrom_probe_capabilities (ide_drive_t *drive) CDROM_CONFIG_FLAGS(drive)->no_eject = 0; if (cap.cd_r_write) CDROM_CONFIG_FLAGS(drive)->cd_r = 1; - if (cap.cd_rw_write) + if (cap.cd_rw_write) { CDROM_CONFIG_FLAGS(drive)->cd_rw = 1; + CDROM_CONFIG_FLAGS(drive)->ram = 1; + } if (cap.test_write) CDROM_CONFIG_FLAGS(drive)->test_write = 1; if (cap.dvd_ram_read || cap.dvd_r_read || cap.dvd_rom) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 20a10a3ed5e4..da06adf48834 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -876,10 +876,10 @@ static void get_capabilities(struct scsi_cd *cd) cd->cdi.mask |= CDC_CLOSE_TRAY; */ /* - * if DVD-RAM of MRW-W, we are randomly writeable + * if DVD-RAM, MRW-W or CD-RW, we are randomly writable */ - if ((cd->cdi.mask & (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM)) != - (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM)) { + if ((cd->cdi.mask & (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW)) != + (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW)) { cd->device->writeable = 1; } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 2edf63444ccb..5642ccc235e8 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -115,6 +115,7 @@ #include #include #include +#include #include diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 73a1ade6a8e9..1c1f5efffd64 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -499,6 +499,7 @@ struct cdrom_generic_command #define GPMODE_VENDOR_PAGE 0x00 #define GPMODE_R_W_ERROR_PAGE 0x01 #define GPMODE_WRITE_PARMS_PAGE 0x05 +#define GPMODE_WCACHING_PAGE 0x08 #define GPMODE_AUDIO_CTL_PAGE 0x0e #define GPMODE_POWER_PAGE 0x1a #define GPMODE_FAULT_FAIL_PAGE 0x1c diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 77f59742c407..71da7d1260cd 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -382,6 +382,8 @@ COMPATIBLE_IOCTL(CDROMREADALL) COMPATIBLE_IOCTL(DVD_READ_STRUCT) COMPATIBLE_IOCTL(DVD_WRITE_STRUCT) COMPATIBLE_IOCTL(DVD_AUTH) +/* pktcdvd */ +COMPATIBLE_IOCTL(PACKET_CTRL_CMD) /* Big L */ ULONG_IOCTL(LOOP_SET_FD) ULONG_IOCTL(LOOP_CHANGE_FD) diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h new file mode 100644 index 000000000000..4e2d2a942ecb --- /dev/null +++ b/include/linux/pktcdvd.h @@ -0,0 +1,275 @@ +/* + * Copyright (C) 2000 Jens Axboe + * Copyright (C) 2001-2004 Peter Osterlund + * + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. + * + * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and + * DVD-RW devices. + * + */ +#ifndef __PKTCDVD_H +#define __PKTCDVD_H + +#include + +/* + * 1 for normal debug messages, 2 is very verbose. 0 to turn it off. + */ +#define PACKET_DEBUG 1 + +#define MAX_WRITERS 8 + +#define PKT_RB_POOL_SIZE 512 + +/* + * How long we should hold a non-full packet before starting data gathering. + */ +#define PACKET_WAIT_TIME (HZ * 5 / 1000) + +/* + * use drive write caching -- we need deferred error handling to be + * able to sucessfully recover with this option (drive will return good + * status as soon as the cdb is validated). 
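+ * Without write caching every packet write completes before status is returned; slower, but errors stay synchronous where the driver can act on them.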
+ */ +#if defined(CONFIG_CDROM_PKTCDVD_WCACHE) +#define USE_WCACHING 1 +#else +#define USE_WCACHING 0 +#endif + +/* + * No user-servicable parts beyond this point -> + */ + +/* + * device types + */ +#define PACKET_CDR 1 +#define PACKET_CDRW 2 +#define PACKET_DVDR 3 +#define PACKET_DVDRW 4 + +/* + * flags + */ +#define PACKET_WRITABLE 1 /* pd is writable */ +#define PACKET_NWA_VALID 2 /* next writable address valid */ +#define PACKET_LRA_VALID 3 /* last recorded address valid */ +#define PACKET_MERGE_SEGS 4 /* perform segment merging to keep */ + /* underlying cdrom device happy */ + +/* + * Disc status -- from READ_DISC_INFO + */ +#define PACKET_DISC_EMPTY 0 +#define PACKET_DISC_INCOMPLETE 1 +#define PACKET_DISC_COMPLETE 2 +#define PACKET_DISC_OTHER 3 + +/* + * write type, and corresponding data block type + */ +#define PACKET_MODE1 1 +#define PACKET_MODE2 2 +#define PACKET_BLOCK_MODE1 8 +#define PACKET_BLOCK_MODE2 10 + +/* + * Last session/border status + */ +#define PACKET_SESSION_EMPTY 0 +#define PACKET_SESSION_INCOMPLETE 1 +#define PACKET_SESSION_RESERVED 2 +#define PACKET_SESSION_COMPLETE 3 + +#define PACKET_MCN "4a656e734178626f65323030300000" + +#undef PACKET_USE_LS + +#define PKT_CTRL_CMD_SETUP 0 +#define PKT_CTRL_CMD_TEARDOWN 1 +#define PKT_CTRL_CMD_STATUS 2 + +struct pkt_ctrl_command { + __u32 command; /* in: Setup, teardown, status */ + __u32 dev_index; /* in/out: Device index */ + __u32 dev; /* in/out: Device nr for cdrw device */ + __u32 pkt_dev; /* in/out: Device nr for packet device */ + __u32 num_devices; /* out: Largest device index + 1 */ + __u32 padding; /* Not used */ +}; + +/* + * packet ioctls + */ +#define PACKET_IOCTL_MAGIC ('X') +#define PACKET_CTRL_CMD _IOWR(PACKET_IOCTL_MAGIC, 1, struct pkt_ctrl_command) + +#ifdef __KERNEL__ +#include +#include +#include + +struct packet_settings +{ + __u8 size; /* packet size in (512 byte) sectors */ + __u8 fp; /* fixed packets */ + __u8 link_loss; /* the rest is specified + * as per Mt Fuji */ + __u8 write_type; + __u8 track_mode; + __u8 block_mode; +}; + +/* + * Very crude stats for now + */ +struct packet_stats +{ + unsigned long pkt_started; + unsigned long pkt_ended; + unsigned long secs_w; + unsigned long secs_rg; + unsigned long secs_r; +}; + +struct packet_cdrw +{ + struct list_head pkt_free_list; + struct list_head pkt_active_list; + spinlock_t active_list_lock; /* Serialize access to pkt_active_list */ + struct task_struct *thread; + atomic_t pending_bios; +}; + +/* + * Switch to high speed reading after reading this many kilobytes + * with no interspersed writes. 
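+ * The idea is that reads are held to the (lower) write speed only while writes are interleaved; a long pure-read stretch switches back to full read speed.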
+ */ +#define HI_SPEED_SWITCH 512 + +struct packet_iosched +{ + atomic_t attention; /* Set to non-zero when queue processing is needed */ + int writing; /* Non-zero when writing, zero when reading */ + spinlock_t lock; /* Protecting read/write queue manipulations */ + struct bio *read_queue; + struct bio *read_queue_tail; + struct bio *write_queue; + struct bio *write_queue_tail; + int high_prio_read; /* An important read request has been queued */ + int successive_reads; +}; + +/* + * 32 buffers of 2048 bytes + */ +#define PACKET_MAX_SIZE 32 +#define PAGES_PER_PACKET (PACKET_MAX_SIZE * CD_FRAMESIZE / PAGE_SIZE) +#define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9) + +enum packet_data_state { + PACKET_IDLE_STATE, /* Not used at the moment */ + PACKET_WAITING_STATE, /* Waiting for more bios to arrive, so */ + /* we don't have to do as much */ + /* data gathering */ + PACKET_READ_WAIT_STATE, /* Waiting for reads to fill in holes */ + PACKET_WRITE_WAIT_STATE, /* Waiting for the write to complete */ + PACKET_RECOVERY_STATE, /* Recover after read/write errors */ + PACKET_FINISHED_STATE, /* After write has finished */ + + PACKET_NUM_STATES /* Number of possible states */ +}; + +/* + * Information needed for writing a single packet + */ +struct pktcdvd_device; + +struct packet_data +{ + struct list_head list; + + spinlock_t lock; /* Lock protecting state transitions and */ + /* orig_bios list */ + + struct bio *orig_bios; /* Original bios passed to pkt_make_request */ + struct bio *orig_bios_tail;/* that will be handled by this packet */ + int write_size; /* Total size of all bios in the orig_bios */ + /* list, measured in number of frames */ + + struct bio *w_bio; /* The bio we will send to the real CD */ + /* device once we have all data for the */ + /* packet we are going to write */ + sector_t sector; /* First sector in this packet */ + int frames; /* Number of frames in this packet */ + + enum packet_data_state state; /* Current state */ + atomic_t run_sm; /* Incremented whenever the state */ + /* machine needs to be run */ + long sleep_time; /* Set this to non-zero to make the state */ + /* machine run after this many jiffies. */ + + atomic_t io_wait; /* Number of pending IO operations */ + atomic_t io_errors; /* Number of read/write errors during IO */ + + struct bio *r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */ + struct page *pages[PAGES_PER_PACKET]; + + int cache_valid; /* If non-zero, the data for the zone defined */ + /* by the sector variable is completely cached */ + /* in the pages[] vector. 
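A rewrite of the same zone can then skip the read-gathering phase entirely.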
*/ + + int id; /* ID number for debugging */ + struct pktcdvd_device *pd; +}; + +struct pkt_rb_node { + struct rb_node rb_node; + struct bio *bio; +}; + +struct packet_stacked_data +{ + struct bio *bio; /* Original read request bio */ + struct pktcdvd_device *pd; +}; +#define PSD_POOL_SIZE 64 + +struct pktcdvd_device +{ + struct block_device *bdev; /* dev attached */ + dev_t pkt_dev; /* our dev */ + char name[20]; + struct packet_settings settings; + struct packet_stats stats; + int refcnt; /* Open count */ + int write_speed; /* current write speed, kB/s */ + int read_speed; /* current read speed, kB/s */ + unsigned long offset; /* start offset */ + __u8 mode_offset; /* 0 / 8 */ + __u8 type; + unsigned long flags; + __u16 mmc3_profile; + __u32 nwa; /* next writable address */ + __u32 lra; /* last recorded address */ + struct packet_cdrw cdrw; + wait_queue_head_t wqueue; + + spinlock_t lock; /* Serialize access to bio_queue */ + struct rb_root bio_queue; /* Work queue of bios we need to handle */ + int bio_queue_size; /* Number of nodes in bio_queue */ + sector_t current_sector; /* Keep track of where the elevator is */ + atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ + /* needs to be run. */ + mempool_t *rb_pool; /* mempool for pkt_rb_node allocations */ + + struct packet_iosched iosched; + struct gendisk *disk; +}; + +#endif /* __KERNEL__ */ + +#endif /* __PKTCDVD_H */ -- cgit v1.2.3 From df02202cfb0d7df1c28225c7da0c3deb3698a730 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Oct 2004 18:01:28 -0700 Subject: [PATCH] switchable and modular io schedulers This patch modularizes the io schedulers completely, allowing them to be modular. Additionally it enables online switching of io schedulers. See also http://lwn.net/Articles/102593/ . There's a scheduler file in the sysfs directory for the block device queue: axboe@router:/sys/block/hda/queue> ls iosched max_sectors_kb read_ahead_kb max_hw_sectors_kb nr_requests scheduler If you list the contents of the file, it will show available schedulers and the active one: axboe@router:/sys/block/hda/queue> cat scheduler [cfq] Lets load a few more. router:/sys/block/hda/queue # modprobe deadline-iosched router:/sys/block/hda/queue # modprobe as-iosched router:/sys/block/hda/queue # cat scheduler [cfq] deadline anticipatory Changing is done with router:/sys/block/hda/queue # echo deadline > scheduler router:/sys/block/hda/queue # cat scheduler cfq [deadline] anticipatory deadline is now the new active io scheduler for hda. 
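Note that the anticipatory scheduler pins its own module once registered: letting it unload would mean walking every task in the system to release its as_io_context first.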
Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/Kconfig.iosched | 8 +- drivers/block/as-iosched.c | 111 ++++++++------ drivers/block/cfq-iosched.c | 113 +++++++++----- drivers/block/deadline-iosched.c | 90 ++++++----- drivers/block/elevator.c | 316 ++++++++++++++++++++++++++++++++------- drivers/block/ll_rw_blk.c | 140 ++++++++++------- drivers/block/noop-iosched.c | 33 +++- drivers/s390/block/dasd.c | 4 +- drivers/s390/char/tape_block.c | 4 +- include/linux/blkdev.h | 10 +- include/linux/elevator.h | 55 ++++--- 11 files changed, 622 insertions(+), 262 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/Kconfig.iosched b/drivers/block/Kconfig.iosched index d938c5fd130b..e0ba6c93717e 100644 --- a/drivers/block/Kconfig.iosched +++ b/drivers/block/Kconfig.iosched @@ -1,5 +1,5 @@ config IOSCHED_NOOP - bool "No-op I/O scheduler" if EMBEDDED + bool default y ---help--- The no-op I/O scheduler is a minimal scheduler that does basic merging @@ -9,7 +9,7 @@ config IOSCHED_NOOP the kernel. config IOSCHED_AS - bool "Anticipatory I/O scheduler" if EMBEDDED + tristate "Anticipatory I/O scheduler" default y ---help--- The anticipatory I/O scheduler is the default disk scheduler. It is @@ -18,7 +18,7 @@ config IOSCHED_AS slower in some cases especially some database loads. config IOSCHED_DEADLINE - bool "Deadline I/O scheduler" if EMBEDDED + tristate "Deadline I/O scheduler" default y ---help--- The deadline I/O scheduler is simple and compact, and is often as @@ -28,7 +28,7 @@ config IOSCHED_DEADLINE anticipatory I/O scheduler and so is a good choice. config IOSCHED_CFQ - bool "CFQ I/O scheduler" if EMBEDDED + tristate "CFQ I/O scheduler" default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 0ef6a665d93e..bb3e9b5bab3c 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -614,7 +614,7 @@ static void as_antic_stop(struct as_data *ad) static void as_antic_timeout(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); @@ -945,7 +945,7 @@ static void update_write_batch(struct as_data *ad) */ static void as_completed_request(request_queue_t *q, struct request *rq) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); WARN_ON(!list_empty(&rq->queuelist)); @@ -1030,7 +1030,7 @@ static void as_remove_queued_request(request_queue_t *q, struct request *rq) { struct as_rq *arq = RQ_DATA(rq); const int data_dir = arq->is_sync; - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; WARN_ON(arq->state != AS_RQ_QUEUED); @@ -1361,7 +1361,7 @@ fifo_expired: static struct request *as_next_request(request_queue_t *q) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct request *rq = NULL; /* @@ -1469,7 +1469,7 @@ static void as_add_request(struct as_data *ad, struct as_rq *arq) */ static void as_requeue_request(request_queue_t *q, struct request *rq) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); if (arq) { @@ -1509,7 +1509,7 @@ static void as_account_queued_request(struct 
as_data *ad, struct request *rq) static void as_insert_request(request_queue_t *q, struct request *rq, int where) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); if (arq) { @@ -1562,7 +1562,7 @@ as_insert_request(request_queue_t *q, struct request *rq, int where) */ static int as_queue_empty(request_queue_t *q) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; if (!list_empty(&ad->fifo_list[REQ_ASYNC]) || !list_empty(&ad->fifo_list[REQ_SYNC]) @@ -1601,7 +1601,7 @@ as_latter_request(request_queue_t *q, struct request *rq) static int as_merge(request_queue_t *q, struct request **req, struct bio *bio) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; sector_t rb_key = bio->bi_sector + bio_sectors(bio); struct request *__rq; int ret; @@ -1656,7 +1656,7 @@ out_insert: static void as_merged_request(request_queue_t *q, struct request *req) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(req); /* @@ -1701,7 +1701,7 @@ static void as_merged_requests(request_queue_t *q, struct request *req, struct request *next) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(req); struct as_rq *anext = RQ_DATA(next); @@ -1788,7 +1788,7 @@ static void as_work_handler(void *data) static void as_put_request(request_queue_t *q, struct request *rq) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); if (!arq) { @@ -1807,7 +1807,7 @@ static void as_put_request(request_queue_t *q, struct request *rq) static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); if (arq) { @@ -1829,7 +1829,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) static int as_may_queue(request_queue_t *q, int rw) { int ret = 0; - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct io_context *ioc; if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { @@ -1842,7 +1842,7 @@ static int as_may_queue(request_queue_t *q, int rw) return ret; } -static void as_exit(request_queue_t *q, elevator_t *e) +static void as_exit_queue(elevator_t *e) { struct as_data *ad = e->elevator_data; @@ -1862,7 +1862,7 @@ static void as_exit(request_queue_t *q, elevator_t *e) * initialize elevator private data (as_data), and alloc a arq for * each request on the free lists */ -static int as_init(request_queue_t *q, elevator_t *e) +static int as_init_queue(request_queue_t *q, elevator_t *e) { struct as_data *ad; int i; @@ -2070,39 +2070,64 @@ static struct kobj_type as_ktype = { .default_attrs = default_attrs, }; -static int __init as_slab_setup(void) +static struct elevator_type iosched_as = { + .ops = { + .elevator_merge_fn = as_merge, + .elevator_merged_fn = as_merged_request, + .elevator_merge_req_fn = as_merged_requests, + .elevator_next_req_fn = as_next_request, + .elevator_add_req_fn = as_insert_request, + .elevator_remove_req_fn = as_remove_request, + .elevator_requeue_req_fn = as_requeue_request, + .elevator_queue_empty_fn = as_queue_empty, 
+ .elevator_completed_req_fn = as_completed_request, + .elevator_former_req_fn = as_former_request, + .elevator_latter_req_fn = as_latter_request, + .elevator_set_req_fn = as_set_request, + .elevator_put_req_fn = as_put_request, + .elevator_may_queue_fn = as_may_queue, + .elevator_init_fn = as_init_queue, + .elevator_exit_fn = as_exit_queue, + }, + + .elevator_ktype = &as_ktype, + .elevator_name = "anticipatory", + .elevator_owner = THIS_MODULE, +}; + +int as_init(void) { + int ret; + arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq), 0, 0, NULL, NULL); - if (!arq_pool) - panic("as: can't init slab pool\n"); + return -ENOMEM; - return 0; + ret = elv_register(&iosched_as); + if (!ret) { + /* + * don't allow AS to get unregistered, since we would have + * to browse all tasks in the system and release their + * as_io_context first + */ + __module_get(THIS_MODULE); + return 0; + } + + kmem_cache_destroy(arq_pool); + return ret; } -subsys_initcall(as_slab_setup); - -elevator_t iosched_as = { - .elevator_merge_fn = as_merge, - .elevator_merged_fn = as_merged_request, - .elevator_merge_req_fn = as_merged_requests, - .elevator_next_req_fn = as_next_request, - .elevator_add_req_fn = as_insert_request, - .elevator_remove_req_fn = as_remove_request, - .elevator_requeue_req_fn = as_requeue_request, - .elevator_queue_empty_fn = as_queue_empty, - .elevator_completed_req_fn = as_completed_request, - .elevator_former_req_fn = as_former_request, - .elevator_latter_req_fn = as_latter_request, - .elevator_set_req_fn = as_set_request, - .elevator_put_req_fn = as_put_request, - .elevator_may_queue_fn = as_may_queue, - .elevator_init_fn = as_init, - .elevator_exit_fn = as_exit, - - .elevator_ktype = &as_ktype, - .elevator_name = "anticipatory", -}; +void as_exit(void) +{ + kmem_cache_destroy(arq_pool); + elv_unregister(&iosched_as); +} + +module_init(as_init); +module_exit(as_exit); -EXPORT_SYMBOL(iosched_as); +MODULE_AUTHOR("Nick Piggin"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("anticipatory IO scheduler"); diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 068f4eae0b5c..6a424dc65823 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -246,7 +246,7 @@ out: static void cfq_remove_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); if (crq) { @@ -267,7 +267,7 @@ static void cfq_remove_request(request_queue_t *q, struct request *rq) static int cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct request *__rq; int ret; @@ -305,7 +305,7 @@ out_insert: static void cfq_merged_request(request_queue_t *q, struct request *req) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(req); cfq_del_crq_hash(crq); @@ -404,7 +404,7 @@ restart: static struct request *cfq_next_request(request_queue_t *q) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct request *rq; if (!list_empty(cfqd->dispatch)) { @@ -531,7 +531,7 @@ static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) static void cfq_insert_request(request_queue_t *q, struct request *rq, int where) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = 
q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); switch (where) { @@ -562,7 +562,7 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) static int cfq_queue_empty(request_queue_t *q) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) return 1; @@ -596,7 +596,7 @@ cfq_latter_request(request_queue_t *q, struct request *rq) static int cfq_may_queue(request_queue_t *q, int rw) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq; int ret = 1; @@ -621,7 +621,7 @@ out: static void cfq_put_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); struct request_list *rl; int other_rw; @@ -654,7 +654,7 @@ static void cfq_put_request(request_queue_t *q, struct request *rq) static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq; struct cfq_rq *crq; @@ -679,7 +679,7 @@ static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) return 1; } -static void cfq_exit(request_queue_t *q, elevator_t *e) +static void cfq_exit_queue(elevator_t *e) { struct cfq_data *cfqd = e->elevator_data; @@ -690,7 +690,7 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } -static int cfq_init(request_queue_t *q, elevator_t *e) +static int cfq_init_queue(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; int i; @@ -732,7 +732,6 @@ static int cfq_init(request_queue_t *q, elevator_t *e) cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; - return 0; out_crqpool: kfree(cfqd->cfq_hash); @@ -743,30 +742,38 @@ out_crqhash: return -ENOMEM; } -static int __init cfq_slab_setup(void) +static void cfq_slab_kill(void) +{ + if (crq_pool) + kmem_cache_destroy(crq_pool); + if (cfq_mpool) + mempool_destroy(cfq_mpool); + if (cfq_pool) + kmem_cache_destroy(cfq_pool); +} + +static int cfq_slab_setup(void) { crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, NULL, NULL); - if (!crq_pool) - panic("cfq_iosched: can't init crq pool\n"); + goto fail; cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, NULL, NULL); - if (!cfq_pool) - panic("cfq_iosched: can't init cfq pool\n"); + goto fail; cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - if (!cfq_mpool) - panic("cfq_iosched: can't init cfq mpool\n"); + goto fail; return 0; +fail: + cfq_slab_kill(); + return -ENOMEM; } -subsys_initcall(cfq_slab_setup); - /* * sysfs parts below --> */ @@ -868,23 +875,51 @@ struct kobj_type cfq_ktype = { .default_attrs = default_attrs, }; -elevator_t iosched_cfq = { - .elevator_name = "cfq", - .elevator_ktype = &cfq_ktype, - .elevator_merge_fn = cfq_merge, - .elevator_merged_fn = cfq_merged_request, - .elevator_merge_req_fn = cfq_merged_requests, - .elevator_next_req_fn = cfq_next_request, - .elevator_add_req_fn = cfq_insert_request, - .elevator_remove_req_fn = cfq_remove_request, - .elevator_queue_empty_fn = cfq_queue_empty, - .elevator_former_req_fn = cfq_former_request, - .elevator_latter_req_fn = cfq_latter_request, - .elevator_set_req_fn = cfq_set_request, - .elevator_put_req_fn = cfq_put_request, - 
.elevator_may_queue_fn = cfq_may_queue, - .elevator_init_fn = cfq_init, - .elevator_exit_fn = cfq_exit, +static struct elevator_type iosched_cfq = { + .ops = { + .elevator_merge_fn = cfq_merge, + .elevator_merged_fn = cfq_merged_request, + .elevator_merge_req_fn = cfq_merged_requests, + .elevator_next_req_fn = cfq_next_request, + .elevator_add_req_fn = cfq_insert_request, + .elevator_remove_req_fn = cfq_remove_request, + .elevator_queue_empty_fn = cfq_queue_empty, + .elevator_former_req_fn = cfq_former_request, + .elevator_latter_req_fn = cfq_latter_request, + .elevator_set_req_fn = cfq_set_request, + .elevator_put_req_fn = cfq_put_request, + .elevator_may_queue_fn = cfq_may_queue, + .elevator_init_fn = cfq_init_queue, + .elevator_exit_fn = cfq_exit_queue, + }, + .elevator_ktype = &cfq_ktype, + .elevator_name = "cfq", + .elevator_owner = THIS_MODULE, }; -EXPORT_SYMBOL(iosched_cfq); +int cfq_init(void) +{ + int ret; + + if (cfq_slab_setup()) + return -ENOMEM; + + ret = elv_register(&iosched_cfq); + if (ret) + cfq_slab_kill(); + + return ret; +} + +void cfq_exit(void) +{ + cfq_slab_kill(); + elv_unregister(&iosched_cfq); +} + +module_init(cfq_init); +module_exit(cfq_exit); + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index fb7ab733c709..0d3e2411f1d3 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -289,7 +289,7 @@ deadline_find_first_drq(struct deadline_data *dd, int data_dir) static inline void deadline_add_request(struct request_queue *q, struct request *rq) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(rq); const int data_dir = rq_data_dir(drq->request); @@ -317,7 +317,7 @@ static void deadline_remove_request(request_queue_t *q, struct request *rq) struct deadline_rq *drq = RQ_DATA(rq); if (drq) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; list_del_init(&drq->fifo); deadline_remove_merge_hints(q, drq); @@ -328,7 +328,7 @@ static void deadline_remove_request(request_queue_t *q, struct request *rq) static int deadline_merge(request_queue_t *q, struct request **req, struct bio *bio) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct request *__rq; int ret; @@ -383,7 +383,7 @@ out_insert: static void deadline_merged_request(request_queue_t *q, struct request *req) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(req); /* @@ -407,7 +407,7 @@ static void deadline_merged_requests(request_queue_t *q, struct request *req, struct request *next) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(req); struct deadline_rq *dnext = RQ_DATA(next); @@ -604,7 +604,7 @@ dispatch_request: static struct request *deadline_next_request(request_queue_t *q) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct request *rq; /* @@ -625,7 +625,7 @@ dispatch: static void deadline_insert_request(request_queue_t *q, struct request *rq, int where) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data 
*dd = q->elevator->elevator_data; /* barriers must flush the reorder queue */ if (unlikely(rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER) @@ -653,7 +653,7 @@ deadline_insert_request(request_queue_t *q, struct request *rq, int where) static int deadline_queue_empty(request_queue_t *q) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; if (!list_empty(&dd->fifo_list[WRITE]) || !list_empty(&dd->fifo_list[READ]) @@ -687,7 +687,7 @@ deadline_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static void deadline_exit(request_queue_t *q, elevator_t *e) +static void deadline_exit_queue(elevator_t *e) { struct deadline_data *dd = e->elevator_data; @@ -703,7 +703,7 @@ static void deadline_exit(request_queue_t *q, elevator_t *e) * initialize elevator private data (deadline_data), and alloc a drq for * each request on the free lists */ -static int deadline_init(request_queue_t *q, elevator_t *e) +static int deadline_init_queue(request_queue_t *q, elevator_t *e) { struct deadline_data *dd; int i; @@ -748,7 +748,7 @@ static int deadline_init(request_queue_t *q, elevator_t *e) static void deadline_put_request(request_queue_t *q, struct request *rq) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(rq); if (drq) { @@ -760,7 +760,7 @@ static void deadline_put_request(request_queue_t *q, struct request *rq) static int deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq; drq = mempool_alloc(dd->drq_pool, gfp_mask); @@ -906,36 +906,54 @@ struct kobj_type deadline_ktype = { .default_attrs = default_attrs, }; -static int __init deadline_slab_setup(void) +static struct elevator_type iosched_deadline = { + .ops = { + .elevator_merge_fn = deadline_merge, + .elevator_merged_fn = deadline_merged_request, + .elevator_merge_req_fn = deadline_merged_requests, + .elevator_next_req_fn = deadline_next_request, + .elevator_add_req_fn = deadline_insert_request, + .elevator_remove_req_fn = deadline_remove_request, + .elevator_queue_empty_fn = deadline_queue_empty, + .elevator_former_req_fn = deadline_former_request, + .elevator_latter_req_fn = deadline_latter_request, + .elevator_set_req_fn = deadline_set_request, + .elevator_put_req_fn = deadline_put_request, + .elevator_init_fn = deadline_init_queue, + .elevator_exit_fn = deadline_exit_queue, + }, + + .elevator_ktype = &deadline_ktype, + .elevator_name = "deadline", + .elevator_owner = THIS_MODULE, +}; + +int deadline_init(void) { + int ret; + drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq), 0, 0, NULL, NULL); if (!drq_pool) - panic("deadline: can't init slab pool\n"); + return -ENOMEM; - return 0; + ret = elv_register(&iosched_deadline); + if (ret) + kmem_cache_destroy(drq_pool); + + return ret; } -subsys_initcall(deadline_slab_setup); - -elevator_t iosched_deadline = { - .elevator_merge_fn = deadline_merge, - .elevator_merged_fn = deadline_merged_request, - .elevator_merge_req_fn = deadline_merged_requests, - .elevator_next_req_fn = deadline_next_request, - .elevator_add_req_fn = deadline_insert_request, - .elevator_remove_req_fn = deadline_remove_request, - .elevator_queue_empty_fn = deadline_queue_empty, - .elevator_former_req_fn = deadline_former_request, - .elevator_latter_req_fn = 
deadline_latter_request, - .elevator_set_req_fn = deadline_set_request, - .elevator_put_req_fn = deadline_put_request, - .elevator_init_fn = deadline_init, - .elevator_exit_fn = deadline_exit, - - .elevator_ktype = &deadline_ktype, - .elevator_name = "deadline", -}; +void deadline_exit(void) +{ + kmem_cache_destroy(drq_pool); + elv_unregister(&iosched_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); -EXPORT_SYMBOL(iosched_deadline); +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("deadline IO scheduler"); diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 35c9385ac133..92cc7a9a5c63 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -37,6 +37,9 @@ #include +static spinlock_t elv_list_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(elv_list); + /* * can we safely merge with this request? */ @@ -60,6 +63,7 @@ inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) return 0; } +EXPORT_SYMBOL(elv_rq_merge_ok); inline int elv_try_merge(struct request *__rq, struct bio *bio) { @@ -77,6 +81,7 @@ inline int elv_try_merge(struct request *__rq, struct bio *bio) return ret; } +EXPORT_SYMBOL(elv_try_merge); inline int elv_try_last_merge(request_queue_t *q, struct bio *bio) { @@ -85,31 +90,117 @@ inline int elv_try_last_merge(request_queue_t *q, struct bio *bio) return ELEVATOR_NO_MERGE; } +EXPORT_SYMBOL(elv_try_last_merge); -/* - * general block -> elevator interface starts here - */ -int elevator_init(request_queue_t *q, elevator_t *type) +struct elevator_type *elevator_find(const char *name) +{ + struct elevator_type *e = NULL; + struct list_head *entry; + + spin_lock_irq(&elv_list_lock); + list_for_each(entry, &elv_list) { + struct elevator_type *__e; + + __e = list_entry(entry, struct elevator_type, list); + + if (!strcmp(__e->elevator_name, name)) { + e = __e; + break; + } + } + spin_unlock_irq(&elv_list_lock); + + return e; +} + +static int elevator_attach(request_queue_t *q, struct elevator_type *e, + struct elevator_queue *eq) { - elevator_t *e = &q->elevator; + int ret = 0; - memcpy(e, type, sizeof(*e)); + if (!try_module_get(e->elevator_owner)) + return -EINVAL; + + memset(eq, 0, sizeof(*eq)); + eq->ops = &e->ops; + eq->elevator_type = e; INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; + q->elevator = eq; + + if (eq->ops->elevator_init_fn) + ret = eq->ops->elevator_init_fn(q, eq); - if (e->elevator_init_fn) - return e->elevator_init_fn(q, e); + return ret; +} + +static char chosen_elevator[16]; + +static void elevator_setup_default(void) +{ + /* + * check if default is set and exists + */ + if (chosen_elevator[0] && elevator_find(chosen_elevator)) + return; + +#if defined(CONFIG_IOSCHED_AS) + strcpy(chosen_elevator, "anticipatory"); +#elif defined(CONFIG_IOSCHED_DEADLINE) + strcpy(chosen_elevator, "deadline"); +#elif defined(CONFIG_IOSCHED_CFQ) + strcpy(chosen_elevator, "cfq"); +#elif defined(CONFIG_IOSCHED_NOOP) + strcpy(chosen_elevator, "noop"); +#else +#error "You must build at least 1 IO scheduler into the kernel" +#endif + printk("elevator: using %s as default io scheduler\n", chosen_elevator); +} +static int __init elevator_setup(char *str) +{ + strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); return 0; } -void elevator_exit(request_queue_t *q) +__setup("elevator=", elevator_setup); + +int elevator_init(request_queue_t *q, char *name) +{ + struct elevator_type *e = NULL; + struct elevator_queue *eq; + int ret = 0; + + elevator_setup_default(); + + if (!name) + name = 
chosen_elevator; + + e = elevator_find(name); + if (!e) + return -EINVAL; + + eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL); + if (!eq) + return -ENOMEM; + + ret = elevator_attach(q, e, eq); + if (ret) + kfree(eq); + + return ret; +} + +void elevator_exit(elevator_t *e) { - elevator_t *e = &q->elevator; + if (e->ops->elevator_exit_fn) + e->ops->elevator_exit_fn(e); - if (e->elevator_exit_fn) - e->elevator_exit_fn(q, e); + module_put(e->elevator_type->elevator_owner); + e->elevator_type = NULL; + kfree(e); } int elevator_global_init(void) @@ -119,32 +210,32 @@ int elevator_global_init(void) int elv_merge(request_queue_t *q, struct request **req, struct bio *bio) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_merge_fn) - return e->elevator_merge_fn(q, req, bio); + if (e->ops->elevator_merge_fn) + return e->ops->elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; } void elv_merged_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_merged_fn) - e->elevator_merged_fn(q, rq); + if (e->ops->elevator_merged_fn) + e->ops->elevator_merged_fn(q, rq); } void elv_merge_requests(request_queue_t *q, struct request *rq, struct request *next) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; if (q->last_merge == next) q->last_merge = NULL; - if (e->elevator_merge_req_fn) - e->elevator_merge_req_fn(q, rq, next); + if (e->ops->elevator_merge_req_fn) + e->ops->elevator_merge_req_fn(q, rq, next); } void elv_requeue_request(request_queue_t *q, struct request *rq) @@ -160,8 +251,8 @@ void elv_requeue_request(request_queue_t *q, struct request *rq) * if iosched has an explicit requeue hook, then use that. otherwise * just put the request at the front of the queue */ - if (q->elevator.elevator_requeue_req_fn) - q->elevator.elevator_requeue_req_fn(q, rq); + if (q->elevator->ops->elevator_requeue_req_fn) + q->elevator->ops->elevator_requeue_req_fn(q, rq); else __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0); } @@ -180,7 +271,7 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where, blk_plug_device(q); rq->q = q; - q->elevator.elevator_add_req_fn(q, rq, where); + q->elevator->ops->elevator_add_req_fn(q, rq, where); if (blk_queue_plugged(q)) { int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight; @@ -203,7 +294,7 @@ void elv_add_request(request_queue_t *q, struct request *rq, int where, static inline struct request *__elv_next_request(request_queue_t *q) { - return q->elevator.elevator_next_req_fn(q); + return q->elevator->ops->elevator_next_req_fn(q); } struct request *elv_next_request(request_queue_t *q) @@ -252,7 +343,7 @@ struct request *elv_next_request(request_queue_t *q) void elv_remove_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; /* * the time frame between a request being removed from the lists @@ -274,16 +365,16 @@ void elv_remove_request(request_queue_t *q, struct request *rq) if (rq == q->last_merge) q->last_merge = NULL; - if (e->elevator_remove_req_fn) - e->elevator_remove_req_fn(q, rq); + if (e->ops->elevator_remove_req_fn) + e->ops->elevator_remove_req_fn(q, rq); } int elv_queue_empty(request_queue_t *q) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_queue_empty_fn) - return e->elevator_queue_empty_fn(q); + if (e->ops->elevator_queue_empty_fn) + return e->ops->elevator_queue_empty_fn(q); return list_empty(&q->queue_head); 
} @@ -292,10 +383,10 @@ struct request *elv_latter_request(request_queue_t *q, struct request *rq) { struct list_head *next; - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_latter_req_fn) - return e->elevator_latter_req_fn(q, rq); + if (e->ops->elevator_latter_req_fn) + return e->ops->elevator_latter_req_fn(q, rq); next = rq->queuelist.next; if (next != &q->queue_head && next != &rq->queuelist) @@ -308,10 +399,10 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) { struct list_head *prev; - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_former_req_fn) - return e->elevator_former_req_fn(q, rq); + if (e->ops->elevator_former_req_fn) + return e->ops->elevator_former_req_fn(q, rq); prev = rq->queuelist.prev; if (prev != &q->queue_head && prev != &rq->queuelist) @@ -322,10 +413,10 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_set_req_fn) - return e->elevator_set_req_fn(q, rq, gfp_mask); + if (e->ops->elevator_set_req_fn) + return e->ops->elevator_set_req_fn(q, rq, gfp_mask); rq->elevator_private = NULL; return 0; @@ -333,25 +424,25 @@ int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) void elv_put_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_put_req_fn) - e->elevator_put_req_fn(q, rq); + if (e->ops->elevator_put_req_fn) + e->ops->elevator_put_req_fn(q, rq); } int elv_may_queue(request_queue_t *q, int rw) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_may_queue_fn) - return e->elevator_may_queue_fn(q, rw); + if (e->ops->elevator_may_queue_fn) + return e->ops->elevator_may_queue_fn(q, rw); return 0; } void elv_completed_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; /* * request is released from the driver, io must be done @@ -359,22 +450,20 @@ void elv_completed_request(request_queue_t *q, struct request *rq) if (blk_account_rq(rq)) q->in_flight--; - if (e->elevator_completed_req_fn) - e->elevator_completed_req_fn(q, rq); + if (e->ops->elevator_completed_req_fn) + e->ops->elevator_completed_req_fn(q, rq); } int elv_register_queue(struct request_queue *q) { - elevator_t *e; - - e = &q->elevator; + elevator_t *e = q->elevator; e->kobj.parent = kobject_get(&q->kobj); if (!e->kobj.parent) return -EBUSY; snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); - e->kobj.ktype = e->elevator_ktype; + e->kobj.ktype = e->elevator_type->elevator_ktype; return kobject_register(&e->kobj); } @@ -382,12 +471,131 @@ int elv_register_queue(struct request_queue *q) void elv_unregister_queue(struct request_queue *q) { if (q) { - elevator_t * e = &q->elevator; + elevator_t *e = q->elevator; kobject_unregister(&e->kobj); kobject_put(&q->kobj); } } +int elv_register(struct elevator_type *e) +{ + if (elevator_find(e->elevator_name)) + BUG(); + + spin_lock_irq(&elv_list_lock); + list_add_tail(&e->list, &elv_list); + spin_unlock_irq(&elv_list_lock); + + printk("io scheduler %s registered\n", e->elevator_name); + return 0; +} +EXPORT_SYMBOL_GPL(elv_register); + +void elv_unregister(struct elevator_type *e) +{ + spin_lock_irq(&elv_list_lock); + list_del_init(&e->list); + spin_unlock_irq(&elv_list_lock); +} 
+EXPORT_SYMBOL_GPL(elv_unregister); + +/* + * switch to new_e io scheduler. be careful not to introduce deadlocks - + * we don't free the old io scheduler, before we have allocated what we + * need for the new one. this way we have a chance of going back to the old + * one, if the new one fails init for some reason + */ +static void elevator_switch(request_queue_t *q, struct elevator_type *new_e) +{ + elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL); + elevator_t *old_elevator; + + if (!e) { + printk("elevator: out of memory\n"); + return; + } + + blk_wait_queue_drained(q); + + /* + * unregister old elevator data + */ + elv_unregister_queue(q); + old_elevator = q->elevator; + + /* + * attach and start new elevator + */ + if (elevator_attach(q, new_e, e)) + goto fail; + + if (elv_register_queue(q)) + goto fail_register; + + /* + * finally exit old elevator and start queue again + */ + elevator_exit(old_elevator); + blk_finish_queue_drain(q); + return; + +fail_register: + /* + * switch failed, exit the new io scheduler and reattach the old + * one again (along with re-adding the sysfs dir) + */ + elevator_exit(e); +fail: + q->elevator = old_elevator; + elv_register_queue(q); + blk_finish_queue_drain(q); + printk("elevator: switch to %s failed\n", new_e->elevator_name); +} + +ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count) +{ + char elevator_name[ELV_NAME_MAX]; + struct elevator_type *e; + + memset(elevator_name, 0, sizeof(elevator_name)); + strncpy(elevator_name, name, sizeof(elevator_name)); + + if (elevator_name[strlen(elevator_name) - 1] == '\n') + elevator_name[strlen(elevator_name) - 1] = '\0'; + + e = elevator_find(elevator_name); + if (!e) { + printk("elevator: type %s not found\n", elevator_name); + return -EINVAL; + } + + elevator_switch(q, e); + return count; +} + +ssize_t elv_iosched_show(request_queue_t *q, char *name) +{ + elevator_t *e = q->elevator; + struct elevator_type *elv = e->elevator_type; + struct list_head *entry; + int len = 0; + + spin_lock_irq(q->queue_lock); + list_for_each(entry, &elv_list) { + struct elevator_type *__e; + + __e = list_entry(entry, struct elevator_type, list); + if (!strcmp(elv->elevator_name, __e->elevator_name)) + len += sprintf(name+len, "[%s] ", elv->elevator_name); + else + len += sprintf(name+len, "%s ", __e->elevator_name); + } + spin_unlock_irq(q->queue_lock); + + len += sprintf(len+name, "\n"); + return len; +} + module_init(elevator_global_init); EXPORT_SYMBOL(elv_add_request); diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 26fdf6be6bd0..b3780ca0fdc0 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1395,7 +1395,8 @@ void blk_cleanup_queue(request_queue_t * q) if (!atomic_dec_and_test(&q->refcnt)) return; - elevator_exit(q); + if (q->elevator) + elevator_exit(q->elevator); del_timer_sync(&q->unplug_timer); kblockd_flush(); @@ -1418,6 +1419,7 @@ static int blk_init_free_list(request_queue_t *q) rl->count[READ] = rl->count[WRITE] = 0; init_waitqueue_head(&rl->wait[READ]); init_waitqueue_head(&rl->wait[WRITE]); + init_waitqueue_head(&rl->drain); rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep); @@ -1429,45 +1431,6 @@ static int blk_init_free_list(request_queue_t *q) static int __make_request(request_queue_t *, struct bio *); -static elevator_t *chosen_elevator = -#if defined(CONFIG_IOSCHED_AS) - &iosched_as; -#elif defined(CONFIG_IOSCHED_DEADLINE) - &iosched_deadline; -#elif defined(CONFIG_IOSCHED_CFQ) - 
&iosched_cfq; -#elif defined(CONFIG_IOSCHED_NOOP) - &elevator_noop; -#else - NULL; -#error "You must have at least 1 I/O scheduler selected" -#endif - -#if defined(CONFIG_IOSCHED_AS) || defined(CONFIG_IOSCHED_DEADLINE) || defined (CONFIG_IOSCHED_NOOP) -static int __init elevator_setup(char *str) -{ -#ifdef CONFIG_IOSCHED_DEADLINE - if (!strcmp(str, "deadline")) - chosen_elevator = &iosched_deadline; -#endif -#ifdef CONFIG_IOSCHED_AS - if (!strcmp(str, "as")) - chosen_elevator = &iosched_as; -#endif -#ifdef CONFIG_IOSCHED_CFQ - if (!strcmp(str, "cfq")) - chosen_elevator = &iosched_cfq; -#endif -#ifdef CONFIG_IOSCHED_NOOP - if (!strcmp(str, "noop")) - chosen_elevator = &elevator_noop; -#endif - return 1; -} - -__setup("elevator=", elevator_setup); -#endif /* CONFIG_IOSCHED_AS || CONFIG_IOSCHED_DEADLINE || CONFIG_IOSCHED_NOOP */ - request_queue_t *blk_alloc_queue(int gfp_mask) { request_queue_t *q = kmem_cache_alloc(requestq_cachep, gfp_mask); @@ -1520,21 +1483,14 @@ EXPORT_SYMBOL(blk_alloc_queue); **/ request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) { - request_queue_t *q; - static int printed; + request_queue_t *q = blk_alloc_queue(GFP_KERNEL); - q = blk_alloc_queue(GFP_KERNEL); if (!q) return NULL; if (blk_init_free_list(q)) goto out_init; - if (!printed) { - printed = 1; - printk("Using %s io scheduler\n", chosen_elevator->elevator_name); - } - q->request_fn = rfn; q->back_merge_fn = ll_back_merge_fn; q->front_merge_fn = ll_front_merge_fn; @@ -1555,7 +1511,7 @@ request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) /* * all done */ - if (!elevator_init(q, chosen_elevator)) + if (!elevator_init(q, NULL)) return q; blk_cleanup_queue(q); @@ -1649,6 +1605,9 @@ static void freed_request(request_queue_t *q, int rw) if (!waitqueue_active(&rl->wait[rw])) blk_clear_queue_full(q, rw); } + if (unlikely(waitqueue_active(&rl->drain)) && + !rl->count[READ] && !rl->count[WRITE]) + wake_up(&rl->drain); } #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) @@ -1661,6 +1620,9 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) struct request_list *rl = &q->rq; struct io_context *ioc = get_io_context(gfp_mask); + if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) + return NULL; + spin_lock_irq(q->queue_lock); if (rl->count[rw]+1 >= q->nr_requests) { /* @@ -2506,6 +2468,70 @@ static inline void blk_partition_remap(struct bio *bio) } } +void blk_finish_queue_drain(request_queue_t *q) +{ + struct request_list *rl = &q->rq; + + clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); + wake_up(&rl->wait[0]); + wake_up(&rl->wait[1]); + wake_up(&rl->drain); +} + +/* + * We rely on the fact that only requests allocated through blk_alloc_request() + * have io scheduler private data structures associated with them. Any other + * type of request (allocated on stack or through kmalloc()) should not go + * to the io scheduler core, but be attached to the queue head instead. 
+ */ +void blk_wait_queue_drained(request_queue_t *q) +{ + struct request_list *rl = &q->rq; + DEFINE_WAIT(wait); + + spin_lock_irq(q->queue_lock); + set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); + + while (rl->count[READ] || rl->count[WRITE]) { + prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE); + + if (rl->count[READ] || rl->count[WRITE]) { + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + io_schedule(); + spin_lock_irq(q->queue_lock); + } + + finish_wait(&rl->drain, &wait); + } + + spin_unlock_irq(q->queue_lock); +} + +/* + * block waiting for the io scheduler being started again. + */ +static inline void block_wait_queue_running(request_queue_t *q) +{ + DEFINE_WAIT(wait); + + while (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) { + struct request_list *rl = &q->rq; + + prepare_to_wait_exclusive(&rl->drain, &wait, + TASK_UNINTERRUPTIBLE); + + /* + * re-check the condition. avoids using prepare_to_wait() + * in the fast path (queue is running) + */ + if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) + io_schedule(); + + finish_wait(&rl->drain, &wait); + } +} + /** * generic_make_request: hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. @@ -2595,6 +2621,8 @@ end_io: if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) goto end_io; + block_wait_queue_running(q); + /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. @@ -3018,6 +3046,7 @@ void kblockd_flush(void) { flush_workqueue(kblockd_workqueue); } +EXPORT_SYMBOL(kblockd_flush); int __init blk_dev_init(void) { @@ -3036,6 +3065,7 @@ int __init blk_dev_init(void) blk_max_low_pfn = max_low_pfn; blk_max_pfn = max_pfn; + return 0; } @@ -3055,6 +3085,7 @@ void put_io_context(struct io_context *ioc) kmem_cache_free(iocontext_cachep, ioc); } } +EXPORT_SYMBOL(put_io_context); /* Called by the exitting task */ void exit_io_context(void) @@ -3106,6 +3137,7 @@ struct io_context *get_io_context(int gfp_flags) local_irq_restore(flags); return ret; } +EXPORT_SYMBOL(get_io_context); void copy_io_context(struct io_context **pdst, struct io_context **psrc) { @@ -3119,6 +3151,7 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc) *pdst = src; } } +EXPORT_SYMBOL(copy_io_context); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) { @@ -3127,7 +3160,7 @@ void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) *ioc1 = *ioc2; *ioc2 = temp; } - +EXPORT_SYMBOL(swap_io_context); /* * sysfs parts below @@ -3285,11 +3318,18 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = { .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_iosched_entry = { + .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, + .show = elv_iosched_show, + .store = elv_iosched_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_iosched_entry.attr, NULL, }; diff --git a/drivers/block/noop-iosched.c b/drivers/block/noop-iosched.c index ffef40be1f92..707dddd7d881 100644 --- a/drivers/block/noop-iosched.c +++ b/drivers/block/noop-iosched.c @@ -83,12 +83,31 @@ struct request *elevator_noop_next_request(request_queue_t *q) return NULL; } -elevator_t elevator_noop = { - .elevator_merge_fn = elevator_noop_merge, - .elevator_merge_req_fn = elevator_noop_merge_requests, - .elevator_next_req_fn = elevator_noop_next_request, - .elevator_add_req_fn = 
elevator_noop_add_request, - .elevator_name = "noop", +static struct elevator_type elevator_noop = { + .ops = { + .elevator_merge_fn = elevator_noop_merge, + .elevator_merge_req_fn = elevator_noop_merge_requests, + .elevator_next_req_fn = elevator_noop_next_request, + .elevator_add_req_fn = elevator_noop_add_request, + }, + .elevator_name = "noop", + .elevator_owner = THIS_MODULE, }; -EXPORT_SYMBOL(elevator_noop); +int noop_init(void) +{ + return elv_register(&elevator_noop); +} + +void noop_exit(void) +{ + elv_unregister(&elevator_noop); +} + +module_init(noop_init); +module_exit(noop_exit); + + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("No-op IO scheduler"); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 2249b78487bd..b3714fbd0083 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1595,8 +1595,8 @@ dasd_alloc_queue(struct dasd_device * device) device->request_queue->queuedata = device; #if 0 - elevator_exit(device->request_queue); - rc = elevator_init(device->request_queue, &elevator_noop); + elevator_exit(device->request_queue->elevator); + rc = elevator_init(device->request_queue, "noop"); if (rc) { blk_cleanup_queue(device->request_queue); return rc; diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index b7f4e7b8be74..1efc9f21229e 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -225,8 +225,8 @@ tapeblock_setup_device(struct tape_device * device) if (!blkdat->request_queue) return -ENOMEM; - elevator_exit(blkdat->request_queue); - rc = elevator_init(blkdat->request_queue, &elevator_noop); + elevator_exit(blkdat->request_queue->elevator); + rc = elevator_init(blkdat->request_queue, "noop"); if (rc) goto cleanup_queue; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4efe45d1af7e..5e4a6ab84ecb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -19,8 +19,8 @@ struct request_queue; typedef struct request_queue request_queue_t; -struct elevator_s; -typedef struct elevator_s elevator_t; +struct elevator_queue; +typedef struct elevator_queue elevator_t; struct request_pm_state; #define BLKDEV_MIN_RQ 4 @@ -80,6 +80,7 @@ struct request_list { int count[2]; mempool_t *rq_pool; wait_queue_head_t wait[2]; + wait_queue_head_t drain; }; #define BLK_MAX_CDB 16 @@ -279,7 +280,7 @@ struct request_queue */ struct list_head queue_head; struct request *last_merge; - elevator_t elevator; + elevator_t *elevator; /* * the queue request freelist, one for reads and one for writes @@ -381,6 +382,7 @@ struct request_queue #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ #define QUEUE_FLAG_ORDERED 8 /* supports ordered writes */ +#define QUEUE_FLAG_DRAIN 9 /* draining queue for sched switch */ #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -617,6 +619,8 @@ extern void blk_dump_rq_flags(struct request *, char *); extern void generic_unplug_device(request_queue_t *); extern void __generic_unplug_device(request_queue_t *); extern long nr_blockdev_pages(void); +extern void blk_wait_queue_drained(request_queue_t *); +extern void blk_finish_queue_drain(request_queue_t *); int blk_get_queue(request_queue_t *); request_queue_t *blk_alloc_queue(int); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 27e8183f4776..95cdfb5bb790 100644 --- 
a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -22,9 +22,9 @@ typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
 typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
 typedef int (elevator_init_fn) (request_queue_t *, elevator_t *);
-typedef void (elevator_exit_fn) (request_queue_t *, elevator_t *);
+typedef void (elevator_exit_fn) (elevator_t *);
-struct elevator_s
+struct elevator_ops
 {
 elevator_merge_fn *elevator_merge_fn;
 elevator_merged_fn *elevator_merged_fn;
@@ -48,12 +48,32 @@ struct elevator_s
 elevator_init_fn *elevator_init_fn;
 elevator_exit_fn *elevator_exit_fn;
+};
- void *elevator_data;
+#define ELV_NAME_MAX (16)
- struct kobject kobj;
+/*
+ * identifies an elevator type, such as AS or deadline
+ */
+struct elevator_type
+{
+ struct list_head list;
+ struct elevator_ops ops;
+ struct elevator_type *elevator_type;
 struct kobj_type *elevator_ktype;
- const char *elevator_name;
+ char elevator_name[ELV_NAME_MAX];
+ struct module *elevator_owner;
+};
+
+/*
+ * each queue has an elevator_queue associated with it
+ */
+struct elevator_queue
+{
+ struct elevator_ops *ops;
+ void *elevator_data;
+ struct kobject kobj;
+ struct elevator_type *elevator_type;
 };
 /*
@@ -79,28 +99,19 @@ extern int elv_set_request(request_queue_t *, struct request *, int);
 extern void elv_put_request(request_queue_t *, struct request *);
 /*
- * noop I/O scheduler. always merges, always inserts new request at tail
- */
-extern elevator_t elevator_noop;
-
-/*
- * deadline i/o scheduler. uses request time outs to prevent indefinite
- * starvation
- */
-extern elevator_t iosched_deadline;
-
-/*
- * anticipatory I/O scheduler
+ * io scheduler registration
 */
-extern elevator_t iosched_as;
+extern int elv_register(struct elevator_type *);
+extern void elv_unregister(struct elevator_type *);
 /*
- * completely fair queueing I/O scheduler
+ * io scheduler sysfs switching
 */
-extern elevator_t iosched_cfq;
+extern ssize_t elv_iosched_show(request_queue_t *, char *);
+extern ssize_t elv_iosched_store(request_queue_t *, const char *, size_t);
-extern int elevator_init(request_queue_t *, elevator_t *);
-extern void elevator_exit(request_queue_t *);
+extern int elevator_init(request_queue_t *, char *);
+extern void elevator_exit(elevator_t *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
 extern int elv_try_merge(struct request *, struct bio *);
 extern int elv_try_last_merge(request_queue_t *, struct bio *);
--
cgit v1.2.3

From f9887e4a0cc489b33776d43be7362c1284e68819 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 18 Oct 2004 18:01:41 -0700
Subject: [PATCH] cfq-v2 I/O scheduler update

Here is the next incarnation of the CFQ io scheduler, so far known as CFQ v2 locally. It attempts to address some of the limitations of the original CFQ io scheduler (henceforth known as CFQ v1). Some of the problems with CFQ v1 are:

- It does accounting for the lifetime of the cfq_queue, which is set up and torn down for the time when a process has io in flight. For a fork-heavy workload (such as a kernel compile, for instance), new processes can effectively starve io of running processes. This is in part due to the fact that CFQ v1 gives preference to a new process to get better latency numbers. Removing that heuristic is not an option exactly because of that.

- It makes no attempt to address inter-cfq_queue fairness.

- It makes no attempt to limit the upper latency bound of a single request.

- It only provides per-tgid grouping.
You need to change the source to group on a different criterion.

- It uses a mempool for the cfq_queues. Theoretically this could deadlock if io-bound processes never exit.

- The may_queue() logic can be unfair since it fluctuates quickly, thus leaving processes sleeping while new processes are allowed to allocate a request.

CFQ v2 attempts to fix these issues. It uses the process io_context logic to maintain a cfq_queue lifetime of the duration of the process (and its io). This means we can now be a lot more clever in deciding which process is allowed to queue or dispatch io to the device. The cfq_io_context is per-process per-queue; this is an extension to what AS currently does, in that we truly do have a unique per-process identifier for io grouping.

Busy queues are sorted by service time used, sub-sorted by in_flight requests. Queues that have no io in flight are also preferred at dispatch time. Accounting is done on completion time of a request, or with a fixed cost for tagged command queueing. Requests are fifo'ed like with deadline, to make sure that a single request doesn't stay in the io scheduler for ages.

Process grouping is selectable at runtime. I provide 4 grouping criteria: process group, thread group id, user id, and group id. As usual, settings are sysfs tweakable in /sys/block//queue/iosched

axboe@apu:[.]s/block/hda/queue/iosched $ ls
back_seek_max fifo_batch_expire find_best_crq queued
back_seek_penalty fifo_expire_async key_type show_status
clear_elapsed fifo_expire_sync quantum tagged

In order, each of these settings controls:

back_seek_max
back_seek_penalty:
Useful logic stolen from AS that allows small backwards seeks in the io stream if we deem them useful. CFQ uses a strict ascending elevator otherwise. _max controls the maximum allowed backwards seek, defaulting to 16MiB. _penalty denotes how expensive we account a backwards seek compared to a forward seek. Default is 2, meaning it's twice as expensive.

clear_elapsed:
Really a debug switch, will go away in the future. It clears the maximum values for completion and dispatch time, shown in show_status.

fifo_batch_expire
fifo_expire_async
fifo_expire_sync:
The settings for the expiry fifo. batch_expire is how often we allow the fifo expire to control which request to select. Default is 125ms. _async is the deadline for async requests (typically writes), _sync is the deadline for sync requests (reads and sync writes). Defaults are, respectively, 5 seconds and 0.5 seconds.

key_type:
The grouping key. Can be set to pgid, tgid, uid, or gid. The current value is shown bracketed:

axboe@apu:[.]s/block/hda/queue/iosched $ cat key_type
[pgid] tgid uid gid

Default is tgid. To set, simply echo any of the 4 words into the file.

quantum:
The number of requests we select for dispatch when the driver asks for work to do and the current pending list is empty. Default is 4.

queued:
The minimum number of requests a group is allowed to queue. Default is 8.

show_status:
Debug output showing the current state of the queues.

tagged:
Set this to 1 if the device is using tagged command queueing. This cannot be reliably detected by CFQ yet, since most drivers don't use the block layer (well, it could, by looking at the number of requests between dispatch and completion, but not completely reliably). Default is 0.
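As a quick, hypothetical illustration (hda and the chosen values are examples only, not recommendations): switching grouping to per-user and flagging the drive as doing tagged queueing would look like

axboe@apu:[.]s/block/hda/queue/iosched $ echo uid > key_type
axboe@apu:[.]s/block/hda/queue/iosched $ cat key_type
pgid tgid [uid] gid
axboe@apu:[.]s/block/hda/queue/iosched $ echo 1 > tagged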
The patch is a little big, but works reliably here on my laptop. There are a number of other changes and fixes in there (like converting to hlist for hashes). The code is commented a lot better; CFQ v1 has basically no comments (reflecting that it was written in one go, not touched or tuned much since then). This is of course only done to increase the AAF, the akpm acceptance factor.

Since I'm on the road, I cannot provide any really good numbers for CFQ v1 compared to v2; maybe someone will help me out there.

Signed-off-by: Jens Axboe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/block/as-iosched.c | 4 +-
 drivers/block/cfq-iosched.c | 1459 ++++++++++++++++++++++++++++++++++++-------
 drivers/block/elevator.c | 2 +-
 drivers/block/ll_rw_blk.c | 115 ++--
 include/linux/blkdev.h | 18 +
 include/linux/elevator.h | 9 +
 6 files changed, 1337 insertions(+), 270 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c
index bb3e9b5bab3c..b049848c19c3 100644
--- a/drivers/block/as-iosched.c
+++ b/drivers/block/as-iosched.c
@@ -1828,14 +1828,14 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 static int as_may_queue(request_queue_t *q, int rw)
 {
- int ret = 0;
+ int ret = ELV_MQUEUE_MAY;
 struct as_data *ad = q->elevator->elevator_data;
 struct io_context *ioc;
 if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) {
 ioc = as_get_io_context();
 if (ad->io_context == ioc)
- ret = 1;
+ ret = ELV_MQUEUE_MUST;
 put_io_context(ioc);
 }

diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c
index 6a424dc65823..738ff90bb2c1 100644
--- a/drivers/block/cfq-iosched.c
+++ b/drivers/block/cfq-iosched.c
@@ -22,96 +22,216 @@
 #include
 #include
+static unsigned long max_elapsed_crq;
+static unsigned long max_elapsed_dispatch;
+
 /*
 * tunables
 */
-static int cfq_quantum = 4;
-static int cfq_queued = 8;
+static int cfq_quantum = 4; /* max queue in one round of service */
+static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/
+static int cfq_service = HZ; /* period over which service is avg */
+static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */
+static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */
+static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */
+static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */
+static int cfq_back_penalty = 2; /* penalty of a backwards seek */
+/*
+ * for the hash of cfqq inside the cfqd
+ */
 #define CFQ_QHASH_SHIFT 6
 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash)
+#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
-#define CFQ_MHASH_SHIFT 8
+/*
+ * for the hash of crq inside the cfqq
+ */
+#define CFQ_MHASH_SHIFT 6
 #define CFQ_MHASH_BLOCK(sec) ((sec) >> 3)
 #define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT)
-#define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
-#define ON_MHASH(crq) !list_empty(&(crq)->hash)
+#define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
 #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
 #define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash)
 #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list)
-#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private)
+#define RQ_DATA(rq) (rq)->elevator_private
+
+/*
+ * rb-tree defines
+ */
+#define RB_NONE (2)
+#define RB_EMPTY(node) ((node)->rb_node == NULL)
+#define RB_CLEAR_COLOR(node)
(node)->rb_color = RB_NONE +#define RB_CLEAR(node) do { \ + (node)->rb_parent = NULL; \ + RB_CLEAR_COLOR((node)); \ + (node)->rb_right = NULL; \ + (node)->rb_left = NULL; \ +} while (0) +#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) +#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) +#define rq_rb_key(rq) (rq)->sector + +/* + * threshold for switching off non-tag accounting + */ +#define CFQ_MAX_TAG (4) + +/* + * sort key types and names + */ +enum { + CFQ_KEY_PGID, + CFQ_KEY_TGID, + CFQ_KEY_UID, + CFQ_KEY_GID, + CFQ_KEY_LAST, +}; + +static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL }; + +/* + * spare queue + */ +#define CFQ_KEY_SPARE (~0UL) static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; -static mempool_t *cfq_mpool; +static kmem_cache_t *cfq_ioc_pool; struct cfq_data { struct list_head rr_list; - struct list_head *dispatch; - struct list_head *cfq_hash; + struct list_head empty_list; - struct list_head *crq_hash; + struct hlist_head *cfq_hash; + struct hlist_head *crq_hash; + /* queues on rr_list (ie they have pending requests */ unsigned int busy_queues; + unsigned int max_queued; + atomic_t ref; + + int key_type; + mempool_t *crq_pool; request_queue_t *queue; + sector_t last_sector; + + int rq_in_driver; + /* - * tunables + * tunables, see top of file */ unsigned int cfq_quantum; unsigned int cfq_queued; + unsigned int cfq_fifo_expire_r; + unsigned int cfq_fifo_expire_w; + unsigned int cfq_fifo_batch_expire; + unsigned int cfq_back_penalty; + unsigned int cfq_back_max; + unsigned int find_best_crq; + + unsigned int cfq_tagged; }; struct cfq_queue { - struct list_head cfq_hash; + /* reference count */ + atomic_t ref; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* hash of mergeable requests */ + struct hlist_node cfq_hash; + /* hash key */ + unsigned long key; + /* whether queue is on rr (or empty) list */ + int on_rr; + /* on either rr or empty list of cfqd */ struct list_head cfq_list; + /* sorted list of pending requests */ struct rb_root sort_list; - int pid; + /* if fifo isn't expired, next request to serve */ + struct cfq_rq *next_crq; + /* requests queued in sort_list */ int queued[2]; -#if 0 - /* - * with a simple addition like this, we can do io priorities. almost. - * does need a split request free list, too. 
- */ - int io_prio -#endif + /* currently allocated requests */ + int allocated[2]; + /* fifo list of requests in sort_list */ + struct list_head fifo[2]; + /* last time fifo expired */ + unsigned long last_fifo_expire; + + int key_type; + + unsigned long service_start; + unsigned long service_used; + + unsigned int max_rate; + + /* number of requests that have been handed to the driver */ + int in_flight; + /* number of currently allocated requests */ + int alloc_limit[2]; }; struct cfq_rq { struct rb_node rb_node; sector_t rb_key; - struct request *request; + struct hlist_node hash; struct cfq_queue *cfq_queue; + struct cfq_io_context *io_context; + + unsigned long service_start; + unsigned long queue_start; - struct list_head hash; + unsigned int in_flight : 1; + unsigned int accounted : 1; + unsigned int is_sync : 1; + unsigned int is_write : 1; }; -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq); +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); +static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); +static void cfq_update_next_crq(struct cfq_rq *); +static void cfq_put_cfqd(struct cfq_data *cfqd); /* - * lots of deadline iosched dupes, can be abstracted later... + * what the fairness is based on (ie how processes are grouped and + * differentiated) */ -static inline void __cfq_del_crq_hash(struct cfq_rq *crq) +static inline unsigned long +cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) { - list_del_init(&crq->hash); + /* + * optimize this so that ->key_type is the offset into the struct + */ + switch (cfqd->key_type) { + case CFQ_KEY_PGID: + return process_group(tsk); + default: + case CFQ_KEY_TGID: + return tsk->tgid; + case CFQ_KEY_UID: + return tsk->uid; + case CFQ_KEY_GID: + return tsk->gid; + } } +/* + * lots of deadline iosched dupes, can be abstracted later... 
+ */ static inline void cfq_del_crq_hash(struct cfq_rq *crq) { - if (ON_MHASH(crq)) - __cfq_del_crq_hash(crq); + hlist_del_init(&crq->hash); } static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) @@ -120,32 +240,32 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) if (q->last_merge == crq->request) q->last_merge = NULL; + + cfq_update_next_crq(crq); } static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct request *rq = crq->request; + const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - BUG_ON(ON_MHASH(crq)); + BUG_ON(!hlist_unhashed(&crq->hash)); - list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); + hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); } static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) { - struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct list_head *entry, *next = hash_list->next; + struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; + struct hlist_node *entry, *next; - while ((entry = next) != hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - next = entry->next; - - BUG_ON(!ON_MHASH(crq)); + BUG_ON(hlist_unhashed(&crq->hash)); if (!rq_mergeable(__rq)) { - __cfq_del_crq_hash(crq); + cfq_del_crq_hash(crq); continue; } @@ -157,29 +277,257 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) } /* - * rb tree support functions + * Lifted from AS - choose which of crq1 and crq2 that is best served now. + * We choose the request that is closest to the head right now. Distance + * behind the head are penalized and only allowed to a certain extent. */ -#define RB_NONE (2) -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) -#define ON_RB(node) ((node)->rb_color != RB_NONE) -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector +static struct cfq_rq * +cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) +{ + sector_t last, s1, s2, d1 = 0, d2 = 0; + int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */ + unsigned long back_max; + + if (crq1 == NULL || crq1 == crq2) + return crq2; + if (crq2 == NULL) + return crq1; + + s1 = crq1->request->sector; + s2 = crq2->request->sector; -static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) + last = cfqd->last_sector; + +#if 0 + if (!list_empty(&cfqd->queue->queue_head)) { + struct list_head *entry = &cfqd->queue->queue_head; + unsigned long distance = ~0UL; + struct request *rq; + + while ((entry = entry->prev) != &cfqd->queue->queue_head) { + rq = list_entry_rq(entry); + + if (blk_barrier_rq(rq)) + break; + + if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { + distance = abs(s1 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { + distance = abs(s2 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + } + } +#endif + + /* + * by definition, 1KiB is 2 sectors + */ + back_max = cfqd->cfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * cfqd->cfq_back_penalty; + else + r1_wrap = 1; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * cfqd->cfq_back_penalty; + else + r2_wrap = 1; + + /* Found required data */ + if (!r1_wrap && r2_wrap) + return crq1; + else if (!r2_wrap && r1_wrap) + return crq2; + else if (r1_wrap && r2_wrap) { + /* both behind the head */ + if (s1 <= s2) + return crq1; + else + return crq2; + } + + /* Both requests in front of the head */ + if (d1 < d2) + return crq1; + else if (d2 < d1) + return crq2; + else { + if (s1 >= s2) + return crq1; + else + return crq2; + } +} + +/* + * would be nice to take fifo expire time into account as well + */ +static struct cfq_rq * +cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *last) { + struct cfq_rq *crq_next = NULL, *crq_prev = NULL; + struct rb_node *rbnext, *rbprev; + + if (!ON_RB(&last->rb_node)) + return NULL; + + if ((rbnext = rb_next(&last->rb_node)) == NULL) + rbnext = rb_first(&cfqq->sort_list); + + rbprev = rb_prev(&last->rb_node); + + if (rbprev) + crq_prev = rb_entry_crq(rbprev); + if (rbnext) + crq_next = rb_entry_crq(rbnext); + + return cfq_choose_req(cfqd, crq_next, crq_prev); +} + +static void cfq_update_next_crq(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + + if (cfqq->next_crq == crq) + cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); +} + +static int cfq_check_sort_rr_list(struct cfq_queue *cfqq) +{ + struct list_head *head = &cfqq->cfqd->rr_list; + struct list_head *next, *prev; + + /* + * list might still be ordered + */ + next = cfqq->cfq_list.next; + if (next != head) { + struct cfq_queue *cnext = list_entry_cfqq(next); + + if (cfqq->service_used > cnext->service_used) + return 1; + } + + prev = cfqq->cfq_list.prev; + if (prev != head) { + struct cfq_queue *cprev = list_entry_cfqq(prev); + + if (cfqq->service_used < cprev->service_used) + return 1; + } + + return 0; +} + +static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) +{ + struct list_head *entry = &cfqq->cfqd->rr_list; + + if (!cfqq->on_rr) + return; + if (!new_queue && !cfq_check_sort_rr_list(cfqq)) + return; + + list_del(&cfqq->cfq_list); + + /* + * sort by our mean service_used, sub-sort by in-flight requests + */ + while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { + struct cfq_queue *__cfqq = list_entry_cfqq(entry); + + if (cfqq->service_used > __cfqq->service_used) + break; + else if (cfqq->service_used == __cfqq->service_used) { + struct list_head *prv; + + while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { + __cfqq = list_entry_cfqq(prv); + + WARN_ON(__cfqq->service_used > cfqq->service_used); + if (cfqq->service_used != __cfqq->service_used) + break; + if (cfqq->in_flight > __cfqq->in_flight) + break; + + entry = prv; + } + } + } + + list_add(&cfqq->cfq_list, entry); +} + +/* + * add to busy list of queues for service, trying to be fair in ordering + * the pending list according to requests serviced + */ +static inline void +cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + /* + * it's currently on the empty list + */ + cfqq->on_rr = 1; + cfqd->busy_queues++; + + if (time_after(jiffies, cfqq->service_start + cfq_service)) + cfqq->service_used >>= 3; + + cfq_sort_rr_list(cfqq, 1); +} + +static inline void +cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + list_move(&cfqq->cfq_list, &cfqd->empty_list); + cfqq->on_rr = 0; + 
+ BUG_ON(!cfqd->busy_queues); + cfqd->busy_queues--; +} + +/* + * rb tree support functions + */ +static inline void cfq_del_crq_rb(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + if (ON_RB(&crq->rb_node)) { - cfqq->queued[rq_data_dir(crq->request)]--; + struct cfq_data *cfqd = cfqq->cfqd; + + BUG_ON(!cfqq->queued[crq->is_sync]); + + cfq_update_next_crq(crq); + + cfqq->queued[crq->is_sync]--; rb_erase(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = NULL; + RB_CLEAR_COLOR(&crq->rb_node); + + if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) + cfq_del_cfqq_rr(cfqd, cfqq); } } static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +__cfq_add_crq_rb(struct cfq_rq *crq) { - struct rb_node **p = &cfqq->sort_list.rb_node; + struct rb_node **p = &crq->cfq_queue->sort_list.rb_node; struct rb_node *parent = NULL; struct cfq_rq *__crq; @@ -199,30 +547,50 @@ __cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) return NULL; } -static void -cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) +static void cfq_add_crq_rb(struct cfq_rq *crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_data *cfqd = cfqq->cfqd; struct request *rq = crq->request; struct cfq_rq *__alias; crq->rb_key = rq_rb_key(rq); - cfqq->queued[rq_data_dir(rq)]++; -retry: - __alias = __cfq_add_crq_rb(cfqq, crq); - if (!__alias) { - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = cfqq; - return; + cfqq->queued[crq->is_sync]++; + + /* + * looks a little odd, but the first insert might return an alias. + * if that happens, put the alias on the dispatch list + */ + while ((__alias = __cfq_add_crq_rb(crq)) != NULL) + cfq_dispatch_sort(cfqd->queue, __alias); + + rb_insert_color(&crq->rb_node, &cfqq->sort_list); + + if (!cfqq->on_rr) + cfq_add_cfqq_rr(cfqd, cfqq); + + /* + * check if this request is a better next-serve candidate + */ + cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); +} + +static inline void +cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + if (ON_RB(&crq->rb_node)) { + rb_erase(&crq->rb_node, &cfqq->sort_list); + cfqq->queued[crq->is_sync]--; } - cfq_dispatch_sort(cfqd, cfqq, __alias); - goto retry; + cfq_add_crq_rb(crq); } static struct request * cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + const unsigned long key = cfq_hash_key(cfqd, current); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); struct rb_node *n; if (!cfqq) @@ -244,23 +612,37 @@ out: return NULL; } -static void cfq_remove_request(request_queue_t *q, struct request *rq) +/* + * make sure the service time gets corrected on reissue of this request + */ +static void cfq_requeue_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; + if (cfqq->cfqd->cfq_tagged) { + cfqq->service_used--; + cfq_sort_rr_list(cfqq, 0); + } + + crq->accounted = 0; + cfqq->cfqd->rq_in_driver--; + } + list_add(&rq->queuelist, &q->queue_head); +} + +static void cfq_remove_request(request_queue_t *q, struct request *rq) +{ + struct cfq_rq *crq = RQ_DATA(rq); + + if (crq) { cfq_remove_merge_hints(q, crq); list_del_init(&rq->queuelist); - if (cfqq) { - cfq_del_crq_rb(cfqq, crq); - - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - } + if (crq->cfq_queue) + cfq_del_crq_rb(crq); } } @@ -314,92 +696,240 @@ 
static void cfq_merged_request(request_queue_t *q, struct request *req) if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { struct cfq_queue *cfqq = crq->cfq_queue; - cfq_del_crq_rb(cfqq, crq); - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_update_next_crq(crq); + cfq_reposition_crq_rb(cfqq, crq); } q->last_merge = req; } static void -cfq_merged_requests(request_queue_t *q, struct request *req, +cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - cfq_merged_request(q, req); + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_rq *cnext = RQ_DATA(next); + + cfq_merged_request(q, rq); + + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { + if (time_before(cnext->queue_start, crq->queue_start)) { + list_move(&rq->queuelist, &next->queuelist); + crq->queue_start = cnext->queue_start; + } + } + + cfq_update_next_crq(cnext); cfq_remove_request(q, next); } -static void -cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq) +/* + * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues, + * this function sector sorts the selected request to minimize seeks. we start + * at cfqd->last_sector, not 0. + */ +static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) { - struct list_head *head = cfqd->dispatch, *entry = head; + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = crq->cfq_queue; + struct list_head *head = &q->queue_head, *entry = head; struct request *__rq; + sector_t last; - cfq_del_crq_rb(cfqq, crq); - cfq_remove_merge_hints(cfqd->queue, crq); + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); + list_del(&crq->request->queuelist); - if (!list_empty(head)) { - __rq = list_entry_rq(head->next); + last = cfqd->last_sector; + while ((entry = entry->prev) != head) { + __rq = list_entry_rq(entry); - if (crq->request->sector < __rq->sector) { - entry = head->prev; - goto link; + if (blk_barrier_rq(crq->request)) + break; + if (!blk_fs_request(crq->request)) + break; + + if (crq->request->sector > __rq->sector) + break; + if (__rq->sector > last && crq->request->sector < last) { + last = crq->request->sector; + break; } } - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); + cfqd->last_sector = last; + crq->in_flight = 1; + cfqq->in_flight++; + list_add(&crq->request->queuelist, entry); +} - if (crq->request->sector <= __rq->sector) - break; +/* + * return expired entry, or NULL to just start from scratch in rbtree + */ +static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) +{ + struct cfq_data *cfqd = cfqq->cfqd; + const int reads = !list_empty(&cfqq->fifo[0]); + const int writes = !list_empty(&cfqq->fifo[1]); + unsigned long now = jiffies; + struct cfq_rq *crq; + + if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire)) + return NULL; + + crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); + if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) { + cfqq->last_fifo_expire = now; + return crq; } -link: - list_add_tail(&crq->request->queuelist, entry); + crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); + if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) { + cfqq->last_fifo_expire = now; + return crq; + } + + return NULL; } +/* + * dispatch a single request from given queue + */ static inline void -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) 
+cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, + struct cfq_queue *cfqq) { - struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + struct cfq_rq *crq; + + /* + * follow expired path, else get first next available + */ + if ((crq = cfq_check_fifo(cfqq)) == NULL) { + if (cfqd->find_best_crq) + crq = cfqq->next_crq; + else + crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + } - cfq_dispatch_sort(cfqd, cfqq, crq); + cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; + + /* + * finally, insert request into driver list + */ + cfq_dispatch_sort(q, crq); } -static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) +static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq; struct list_head *entry, *tmp; - int ret, queued, good_queues; + int queued, busy_queues, first_round; if (list_empty(&cfqd->rr_list)) return 0; - queued = ret = 0; + queued = 0; + first_round = 1; restart: - good_queues = 0; + busy_queues = 0; list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(cfqd->rr_list.next); + cfqq = list_entry_cfqq(entry); BUG_ON(RB_EMPTY(&cfqq->sort_list)); - __cfq_dispatch_requests(q, cfqd, cfqq); + /* + * first round of queueing, only select from queues that + * don't already have io in-flight + */ + if (first_round && cfqq->in_flight) + continue; + + cfq_dispatch_request(q, cfqd, cfqq); - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - else - good_queues++; + if (!RB_EMPTY(&cfqq->sort_list)) + busy_queues++; queued++; - ret = 1; } - if ((queued < cfqd->cfq_quantum) && good_queues) + if ((queued < max_dispatch) && (busy_queues || first_round)) { + first_round = 0; goto restart; + } - return ret; + return queued; +} + +static inline void cfq_account_dispatch(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_data *cfqd = cfqq->cfqd; + unsigned long now, elapsed; + + /* + * accounted bit is necessary since some drivers will call + * elv_next_request() many times for the same request (eg ide) + */ + if (crq->accounted) + return; + + now = jiffies; + if (cfqq->service_start == ~0UL) + cfqq->service_start = now; + + /* + * on drives with tagged command queueing, command turn-around time + * doesn't necessarily reflect the time spent processing this very + * command inside the drive. 
so do the accounting differently there, + * by just sorting on the number of requests + */ + if (cfqd->cfq_tagged) { + if (time_after(now, cfqq->service_start + cfq_service)) { + cfqq->service_start = now; + cfqq->service_used /= 10; + } + + cfqq->service_used++; + cfq_sort_rr_list(cfqq, 0); + } + + elapsed = now - crq->queue_start; + if (elapsed > max_elapsed_dispatch) + max_elapsed_dispatch = elapsed; + + crq->accounted = 1; + crq->service_start = now; + + if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) { + cfqq->cfqd->cfq_tagged = 1; + printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG); + } +} + +static inline void +cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + struct cfq_data *cfqd = cfqq->cfqd; + + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; + + if (!cfqd->cfq_tagged) { + unsigned long now = jiffies; + unsigned long duration = now - crq->service_start; + + if (time_after(now, cfqq->service_start + cfq_service)) { + cfqq->service_start = now; + cfqq->service_used >>= 3; + } + + cfqq->service_used += duration; + cfq_sort_rr_list(cfqq, 0); + + if (duration > max_elapsed_crq) + max_elapsed_crq = duration; + } } static struct request *cfq_next_request(request_queue_t *q) @@ -407,100 +937,305 @@ static struct request *cfq_next_request(request_queue_t *q) struct cfq_data *cfqd = q->elevator->elevator_data; struct request *rq; - if (!list_empty(cfqd->dispatch)) { + if (!list_empty(&q->queue_head)) { struct cfq_rq *crq; dispatch: - rq = list_entry_rq(cfqd->dispatch->next); + rq = list_entry_rq(q->queue_head.next); - crq = RQ_DATA(rq); - if (crq) + if ((crq = RQ_DATA(rq)) != NULL) { cfq_remove_merge_hints(q, crq); + cfq_account_dispatch(crq); + } return rq; } - if (cfq_dispatch_requests(q, cfqd)) + if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) goto dispatch; return NULL; } +/* + * task holds one reference to the queue, dropped when task exits. each crq + * in-flight on this queue also holds a reference, dropped when crq is freed. + * + * queue lock must be held here. 
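+ * (it protects the cfq_hash and empty_list manipulation on the final put)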
+ */ +static void cfq_put_queue(struct cfq_queue *cfqq) +{ + BUG_ON(!atomic_read(&cfqq->ref)); + + if (!atomic_dec_and_test(&cfqq->ref)) + return; + + BUG_ON(rb_first(&cfqq->sort_list)); + BUG_ON(cfqq->on_rr); + + cfq_put_cfqd(cfqq->cfqd); + + /* + * it's on the empty list and still hashed + */ + list_del(&cfqq->cfq_list); + hlist_del(&cfqq->cfq_hash); + kmem_cache_free(cfq_pool, cfqq); +} + static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) { - struct list_head *hash_list = &cfqd->cfq_hash[hashval]; - struct list_head *entry; + struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; + struct hlist_node *entry, *next; - list_for_each(entry, hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); - if (__cfqq->pid == pid) + if (__cfqq->key == key) return __cfqq; } return NULL; } -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) +static struct cfq_queue * +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) +{ + return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); +} + +static inline void +cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, + struct cfq_io_context *cic) +{ + unsigned long hashkey = cfq_hash_key(cfqd, current); + unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + struct cfq_queue *__cfqq; + unsigned long flags; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + hlist_del(&(*cfqq)->cfq_hash); + + __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); + if (!__cfqq || __cfqq == *cfqq) { + __cfqq = *cfqq; + hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + __cfqq->key_type = cfqd->key_type; + } else { + atomic_inc(&__cfqq->ref); + cic->cfqq = __cfqq; + cfq_put_queue(*cfqq); + *cfqq = __cfqq; + } + + cic->cfqq = __cfqq; + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +static void cfq_free_io_context(struct cfq_io_context *cic) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + kmem_cache_free(cfq_ioc_pool, cic); +} - return __cfq_find_cfq_hash(cfqd, pid, hashval); +/* + * locking hierarchy is: io_context lock -> queue locks + */ +static void cfq_exit_io_context(struct cfq_io_context *cic) +{ + struct cfq_queue *cfqq = cic->cfqq; + struct list_head *entry = &cic->list; + request_queue_t *q; + unsigned long flags; + + /* + * put the reference this task is holding to the various queues + */ + spin_lock_irqsave(&cic->ioc->lock, flags); + while ((entry = cic->list.next) != &cic->list) { + struct cfq_io_context *__cic; + + __cic = list_entry(entry, struct cfq_io_context, list); + list_del(entry); + + q = __cic->cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(__cic->cfqq); + spin_unlock(q->queue_lock); + } + + q = cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(cfqq); + spin_unlock(q->queue_lock); + + cic->cfqq = NULL; + spin_unlock_irqrestore(&cic->ioc->lock, flags); } -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) { - cfqd->busy_queues--; - list_del(&cfqq->cfq_list); - list_del(&cfqq->cfq_hash); - mempool_free(cfqq, cfq_mpool); + struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); + + if (cic) { + cic->dtor = cfq_free_io_context; + cic->exit = cfq_exit_io_context; + INIT_LIST_HEAD(&cic->list); + cic->cfqq = NULL; + } + + 
return cic; } -static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +/* + * Setup general io context and cfq io context. There can be several cfq + * io contexts per general io context, if this process is doing io to more + * than one device managed by cfq. Note that caller is holding a reference to + * cfqq, so we don't need to worry about it disappearing + */ +static struct cfq_io_context * +cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + struct cfq_data *cfqd = (*cfqq)->cfqd; + struct cfq_queue *__cfqq = *cfqq; + struct cfq_io_context *cic; + struct io_context *ioc; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + ioc = get_io_context(gfp_flags); + if (!ioc) + return NULL; + + if ((cic = ioc->cic) == NULL) { + cic = cfq_alloc_io_context(gfp_flags); + + if (cic == NULL) + goto err; + + ioc->cic = cic; + cic->ioc = ioc; + cic->cfqq = __cfqq; + atomic_inc(&__cfqq->ref); + } else { + struct cfq_io_context *__cic; + unsigned long flags; + + /* + * since the first cic on the list is actually the head + * itself, need to check this here or we'll duplicate an + * cic per ioc for no reason + */ + if (cic->cfqq == __cfqq) + goto out; + + /* + * cic exists, check if we already are there. linear search + * should be ok here, the list will usually not be more than + * 1 or a few entries long + */ + spin_lock_irqsave(&ioc->lock, flags); + list_for_each_entry(__cic, &cic->list, list) { + /* + * this process is already holding a reference to + * this queue, so no need to get one more + */ + if (__cic->cfqq == __cfqq) { + cic = __cic; + spin_unlock_irqrestore(&ioc->lock, flags); + goto out; + } + } + spin_unlock_irqrestore(&ioc->lock, flags); + + /* + * nope, process doesn't have a cic assoicated with this + * cfqq yet. 
get a new one and add to list + */ + __cic = cfq_alloc_io_context(gfp_flags); + if (__cic == NULL) + goto err; + + __cic->ioc = ioc; + __cic->cfqq = __cfqq; + atomic_inc(&__cfqq->ref); + spin_lock_irqsave(&ioc->lock, flags); + list_add(&__cic->list, &cic->list); + spin_unlock_irqrestore(&ioc->lock, flags); + + cic = __cic; + *cfqq = __cfqq; + } + +out: + /* + * if key_type has been changed on the fly, we lazily rehash + * each queue at lookup time + */ + if ((*cfqq)->key_type != cfqd->key_type) + cfq_rehash_cfqq(cfqd, cfqq, cic); + + return cic; +err: + put_io_context(ioc); + return NULL; +} + +static struct cfq_queue * +__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) +{ + const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; - request_queue_t *q = cfqd->queue; retry: - cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); if (!cfqq) { if (new_cfqq) { cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - spin_unlock_irq(q->queue_lock); - new_cfqq = mempool_alloc(cfq_mpool, gfp_mask); - spin_lock_irq(q->queue_lock); + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else - return NULL; + goto out; + + memset(cfqq, 0, sizeof(*cfqq)); - INIT_LIST_HEAD(&cfqq->cfq_hash); + INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); RB_CLEAR_ROOT(&cfqq->sort_list); - - cfqq->pid = pid; - cfqq->queued[0] = cfqq->queued[1] = 0; - list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + INIT_LIST_HEAD(&cfqq->fifo[0]); + INIT_LIST_HEAD(&cfqq->fifo[1]); + + cfqq->key = key; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; + atomic_inc(&cfqd->ref); + cfqq->key_type = cfqd->key_type; + cfqq->service_start = ~0UL; } if (new_cfqq) - mempool_free(new_cfqq, cfq_mpool); + kmem_cache_free(cfq_pool, new_cfqq); + atomic_inc(&cfqq->ref); +out: + WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); return cfqq; } -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +static struct cfq_queue * +cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) { request_queue_t *q = cfqd->queue; struct cfq_queue *cfqq; spin_lock_irq(q->queue_lock); - cfqq = __cfq_get_queue(cfqd, pid, gfp_mask); + cfqq = __cfq_get_queue(cfqd, key, gfp_mask); spin_unlock_irq(q->queue_lock); return cfqq; @@ -508,24 +1243,14 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct cfq_queue *cfqq; + crq->is_sync = 0; + if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) + crq->is_sync = 1; - cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC); - if (cfqq) { - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_add_crq_rb(crq); + crq->queue_start = jiffies; - if (list_empty(&cfqq->cfq_list)) { - list_add(&cfqq->cfq_list, &cfqd->rr_list); - cfqd->busy_queues++; - } - } else { - /* - * should can only happen if the request wasn't allocated - * through blk_alloc_request(), eg stack requests from ide-cd - * (those should be removed) _and_ we are in OOM. 
- */ - list_add_tail(&crq->request->queuelist, cfqd->dispatch); - } + list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); } static void @@ -536,12 +1261,12 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) switch (where) { case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd)) + while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) ; - list_add_tail(&rq->queuelist, cfqd->dispatch); + list_add_tail(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, cfqd->dispatch); + list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); @@ -564,10 +1289,25 @@ static int cfq_queue_empty(request_queue_t *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) - return 1; + return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); +} + +static void cfq_completed_request(request_queue_t *q, struct request *rq) +{ + struct cfq_rq *crq = RQ_DATA(rq); + + if (unlikely(!blk_fs_request(rq))) + return; + + if (crq->in_flight) { + struct cfq_queue *cfqq = crq->cfq_queue; + + WARN_ON(!cfqq->in_flight); + cfqq->in_flight--; + + cfq_account_completion(cfqq, crq); + } - return 0; } static struct request * @@ -598,90 +1338,167 @@ static int cfq_may_queue(request_queue_t *q, int rw) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq; - int ret = 1; + int ret = ELV_MQUEUE_MAY; - if (!cfqd->busy_queues) - goto out; + if (current->flags & PF_MEMALLOC) + return ELV_MQUEUE_MAY; - cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); if (cfqq) { - int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues; + int limit = cfqd->max_queued; - if (limit < 3) - limit = 3; + if (cfqq->allocated[rw] < cfqd->cfq_queued) + return ELV_MQUEUE_MUST; + + if (cfqd->busy_queues) + limit = q->nr_requests / cfqd->busy_queues; + + if (limit < cfqd->cfq_queued) + limit = cfqd->cfq_queued; else if (limit > cfqd->max_queued) limit = cfqd->max_queued; - if (cfqq->queued[rw] > limit) - ret = 0; + if (cfqq->allocated[rw] >= limit) { + if (limit > cfqq->alloc_limit[rw]) + cfqq->alloc_limit[rw] = limit; + + ret = ELV_MQUEUE_NO; + } } -out: + return ret; } +static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) +{ + struct request_list *rl = &q->rq; + const int write = waitqueue_active(&rl->wait[WRITE]); + const int read = waitqueue_active(&rl->wait[READ]); + + if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) + wake_up(&rl->wait[READ]); + if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) + wake_up(&rl->wait[WRITE]); +} + +/* + * queue lock held here + */ static void cfq_put_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); - struct request_list *rl; - int other_rw; if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + BUG_ON(q->last_merge == rq); - BUG_ON(ON_MHASH(crq)); + BUG_ON(!hlist_unhashed(&crq->hash)); + + if (crq->io_context) + put_io_context(crq->io_context->ioc); + + if (!cfqq->allocated[crq->is_write]) { + WARN_ON(1); + cfqq->allocated[crq->is_write] = 1; + } + cfqq->allocated[crq->is_write]--; mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; - } - /* - * work-around for may_queue "bug": if a read gets issued and refused - * to queue because writes ate all the allowed slots and no other - 
* reads are pending for this queue, it could get stuck infinitely - * since freed_request() only checks the waitqueue for writes when - * freeing them. or vice versa for a single write vs many reads. - * so check here whether "the other" data direction might be able - * to queue and wake them - */ - rl = &q->rq; - other_rw = rq_data_dir(rq) ^ 1; - if (rl->count[other_rw] <= q->nr_requests) { smp_mb(); - if (waitqueue_active(&rl->wait[other_rw])) - wake_up(&rl->wait[other_rw]); + cfq_check_waiters(q, cfqq); + cfq_put_queue(cfqq); } } +/* + * Allocate cfq data structures associated with this request. A queue and + */ static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_io_context *cic; + const int rw = rq_data_dir(rq); struct cfq_queue *cfqq; struct cfq_rq *crq; + unsigned long flags; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + spin_lock_irqsave(q->queue_lock, flags); + + cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask); + if (!cfqq) { +#if 0 + cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, gfp_mask); + printk("%s: got spare queue\n", current->comm); +#else + goto out_lock; +#endif + } + + if (cfqq->allocated[rw] >= cfqd->max_queued) + goto out_lock; + + spin_unlock_irqrestore(q->queue_lock, flags); /* - * prepare a queue up front, so cfq_enqueue() doesn't have to + * if hashing type has changed, the cfq_queue might change here. we + * don't bother rechecking ->allocated since it should be a rare + * event */ - cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask); - if (!cfqq) - return 1; + cic = cfq_get_io_context(&cfqq, gfp_mask); + if (!cic) + goto err; crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { - memset(crq, 0, sizeof(*crq)); RB_CLEAR(&crq->rb_node); + crq->rb_key = 0; crq->request = rq; - crq->cfq_queue = NULL; - INIT_LIST_HEAD(&crq->hash); + INIT_HLIST_NODE(&crq->hash); + crq->cfq_queue = cfqq; + crq->io_context = cic; + crq->service_start = crq->queue_start = 0; + crq->in_flight = crq->accounted = crq->is_sync = 0; + crq->is_write = rw; rq->elevator_private = crq; + cfqq->allocated[rw]++; + cfqq->alloc_limit[rw] = 0; return 0; } + put_io_context(cic->ioc); +err: + spin_lock_irqsave(q->queue_lock, flags); + cfq_put_queue(cfqq); +out_lock: + spin_unlock_irqrestore(q->queue_lock, flags); return 1; } -static void cfq_exit_queue(elevator_t *e) +static void cfq_put_cfqd(struct cfq_data *cfqd) { - struct cfq_data *cfqd = e->elevator_data; + request_queue_t *q = cfqd->queue; + elevator_t *e = q->elevator; + struct cfq_queue *cfqq; + + if (!atomic_dec_and_test(&cfqd->ref)) + return; + + /* + * kill spare queue, getting it means we have two refences to it. 
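+	 * (__cfq_get_queue() takes a reference on lookup, on top of the
+	 * one taken when the spare queue was created at init time)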
+ * drop both + */ + spin_lock_irq(q->queue_lock); + cfqq = __cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_ATOMIC); + cfq_put_queue(cfqq); + cfq_put_queue(cfqq); + spin_unlock_irq(q->queue_lock); + + blk_put_queue(q); e->elevator_data = NULL; mempool_destroy(cfqd->crq_pool); @@ -690,9 +1507,15 @@ static void cfq_exit_queue(elevator_t *e) kfree(cfqd); } +static void cfq_exit_queue(elevator_t *e) +{ + cfq_put_cfqd(e->elevator_data); +} + static int cfq_init_queue(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; + struct cfq_queue *cfqq; int i; cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); @@ -701,12 +1524,13 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) memset(cfqd, 0, sizeof(*cfqd)); INIT_LIST_HEAD(&cfqd->rr_list); + INIT_LIST_HEAD(&cfqd->empty_list); - cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); + cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) goto out_crqhash; - cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); + cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); if (!cfqd->cfq_hash) goto out_cfqhash; @@ -715,24 +1539,44 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) goto out_crqpool; for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->crq_hash[i]); + INIT_HLIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->cfq_hash[i]); + INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - cfqd->dispatch = &q->queue_head; e->elevator_data = cfqd; + cfqd->queue = q; + atomic_inc(&q->refcnt); + + /* + * setup spare failure queue + */ + cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_KERNEL); + if (!cfqq) + goto out_spare; /* * just set it to some high value, we want anyone to be able to queue * some requests. 
fairness is handled differently */ - cfqd->max_queued = q->nr_requests; - q->nr_requests = 8192; + q->nr_requests = 1024; + cfqd->max_queued = q->nr_requests / 16; + q->nr_batching = cfq_queued; + cfqd->key_type = CFQ_KEY_TGID; + cfqd->find_best_crq = 1; + atomic_set(&cfqd->ref, 1); cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; + cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; + cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; + cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; + cfqd->cfq_back_max = cfq_back_max; + cfqd->cfq_back_penalty = cfq_back_penalty; + return 0; +out_spare: + mempool_destroy(cfqd->crq_pool); out_crqpool: kfree(cfqd->cfq_hash); out_cfqhash: @@ -746,13 +1590,13 @@ static void cfq_slab_kill(void) { if (crq_pool) kmem_cache_destroy(crq_pool); - if (cfq_mpool) - mempool_destroy(cfq_mpool); if (cfq_pool) kmem_cache_destroy(cfq_pool); + if (cfq_ioc_pool) + kmem_cache_destroy(cfq_ioc_pool); } -static int cfq_slab_setup(void) +static int __init cfq_slab_setup(void) { crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, NULL, NULL); @@ -764,8 +1608,9 @@ static int cfq_slab_setup(void) if (!cfq_pool) goto fail; - cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - if (!cfq_mpool) + cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool", + sizeof(struct cfq_io_context), 0, 0, NULL, NULL); + if (!cfq_ioc_pool) goto fail; return 0; @@ -774,6 +1619,7 @@ fail: return -ENOMEM; } + /* * sysfs parts below --> */ @@ -798,6 +1644,94 @@ cfq_var_store(unsigned int *var, const char *page, size_t count) return count; } +static ssize_t +cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) +{ + max_elapsed_dispatch = max_elapsed_crq = 0; + return count; +} + +static ssize_t +cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) +{ + spin_lock_irq(cfqd->queue->queue_lock); + if (!strncmp(page, "pgid", 4)) + cfqd->key_type = CFQ_KEY_PGID; + else if (!strncmp(page, "tgid", 4)) + cfqd->key_type = CFQ_KEY_TGID; + else if (!strncmp(page, "uid", 3)) + cfqd->key_type = CFQ_KEY_UID; + else if (!strncmp(page, "gid", 3)) + cfqd->key_type = CFQ_KEY_GID; + spin_unlock_irq(cfqd->queue->queue_lock); + return count; +} + +static ssize_t +cfq_read_key_type(struct cfq_data *cfqd, char *page) +{ + ssize_t len = 0; + int i; + + for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { + if (cfqd->key_type == i) + len += sprintf(page+len, "[%s] ", cfq_key_types[i]); + else + len += sprintf(page+len, "%s ", cfq_key_types[i]); + } + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +cfq_status_show(struct cfq_data *cfqd, char *page) +{ + struct list_head *entry; + struct cfq_queue *cfqq; + ssize_t len; + int i = 0, queues; + + len = sprintf(page, "Busy queues: %u\n", cfqd->busy_queues); + len += sprintf(page+len, "key type: %s\n", + cfq_key_types[cfqd->key_type]); + len += sprintf(page+len, "last sector: %Lu\n", + (unsigned long long)cfqd->last_sector); + len += sprintf(page+len, "max time in iosched: %lu\n", + max_elapsed_dispatch); + len += sprintf(page+len, "max completion time: %lu\n", max_elapsed_crq); + + len += sprintf(page+len, "Busy queue list:\n"); + spin_lock_irq(cfqd->queue->queue_lock); + list_for_each(entry, &cfqd->rr_list) { + i++; + cfqq = list_entry_cfqq(entry); + len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, " + "queued=%d/%d, last_fifo=%lu, service_used=%lu\n", + cfqq->key, cfqq->allocated[0], cfqq->allocated[1], + cfqq->queued[0], cfqq->queued[1], + cfqq->last_fifo_expire, cfqq->service_used); + 
} + len += sprintf(page+len, " busy queues total: %d\n", i); + queues = i; + + len += sprintf(page+len, "Empty queue list:\n"); + i = 0; + list_for_each(entry, &cfqd->empty_list) { + i++; + cfqq = list_entry_cfqq(entry); + len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, " + "queued=%d/%d, last_fifo=%lu, service_used=%lu\n", + cfqq->key, cfqq->allocated[0], cfqq->allocated[1], + cfqq->queued[0], cfqq->queued[1], + cfqq->last_fifo_expire, cfqq->service_used); + } + len += sprintf(page+len, " empty queues total: %d\n", i); + queues += i; + len += sprintf(page+len, "Total queues: %d\n", queues); + spin_unlock_irq(cfqd->queue->queue_lock); + return len; +} + #define SHOW_FUNCTION(__FUNC, __VAR) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ @@ -805,6 +1739,12 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); +SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r); +SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w); +SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire); +SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq); +SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max); +SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -817,8 +1757,14 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ *(__PTR) = (MAX); \ return ret; \ } -STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); +STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX); +STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX); +STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX); +STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX); +STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX); +STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1); +STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX); +STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { @@ -831,10 +1777,62 @@ static struct cfq_fs_entry cfq_queued_entry = { .show = cfq_queued_show, .store = cfq_queued_store, }; +static struct cfq_fs_entry cfq_fifo_expire_r_entry = { + .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_expire_r_show, + .store = cfq_fifo_expire_r_store, +}; +static struct cfq_fs_entry cfq_fifo_expire_w_entry = { + .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_expire_w_show, + .store = cfq_fifo_expire_w_store, +}; +static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { + .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_batch_expire_show, + .store = cfq_fifo_batch_expire_store, +}; +static struct cfq_fs_entry cfq_find_best_entry = { + .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_find_best_show, + .store = cfq_find_best_store, +}; +static struct cfq_fs_entry cfq_back_max_entry = { + .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_back_max_show, + .store = cfq_back_max_store, +}; +static struct cfq_fs_entry cfq_back_penalty_entry = { + .attr = {.name = "back_seek_penalty", .mode = 
S_IRUGO | S_IWUSR }, + .show = cfq_back_penalty_show, + .store = cfq_back_penalty_store, +}; +static struct cfq_fs_entry cfq_clear_elapsed_entry = { + .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, + .store = cfq_clear_elapsed, +}; +static struct cfq_fs_entry cfq_misc_entry = { + .attr = {.name = "show_status", .mode = S_IRUGO }, + .show = cfq_status_show, +}; +static struct cfq_fs_entry cfq_key_type_entry = { + .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_read_key_type, + .store = cfq_set_key_type, +}; static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, + &cfq_fifo_expire_r_entry.attr, + &cfq_fifo_expire_w_entry.attr, + &cfq_fifo_batch_expire_entry.attr, + &cfq_key_type_entry.attr, + &cfq_find_best_entry.attr, + &cfq_back_max_entry.attr, + &cfq_back_penalty_entry.attr, + &cfq_clear_elapsed_entry.attr, + &cfq_misc_entry.attr, NULL, }; @@ -883,7 +1881,9 @@ static struct elevator_type iosched_cfq = { .elevator_next_req_fn = cfq_next_request, .elevator_add_req_fn = cfq_insert_request, .elevator_remove_req_fn = cfq_remove_request, + .elevator_requeue_req_fn = cfq_requeue_request, .elevator_queue_empty_fn = cfq_queue_empty, + .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = cfq_former_request, .elevator_latter_req_fn = cfq_latter_request, .elevator_set_req_fn = cfq_set_request, @@ -892,9 +1892,9 @@ static struct elevator_type iosched_cfq = { .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, }, - .elevator_ktype = &cfq_ktype, - .elevator_name = "cfq", - .elevator_owner = THIS_MODULE, + .elevator_ktype = &cfq_ktype, + .elevator_name = "cfq", + .elevator_owner = THIS_MODULE, }; int cfq_init(void) @@ -905,9 +1905,12 @@ int cfq_init(void) return -ENOMEM; ret = elv_register(&iosched_cfq); - if (ret) - cfq_slab_kill(); + if (!ret) { + __module_get(THIS_MODULE); + return 0; + } + cfq_slab_kill(); return ret; } diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 92cc7a9a5c63..1b4f6a70c0ca 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -437,7 +437,7 @@ int elv_may_queue(request_queue_t *q, int rw) if (e->ops->elevator_may_queue_fn) return e->ops->elevator_may_queue_fn(q, rw); - return 0; + return ELV_MQUEUE_MAY; } void elv_completed_request(request_queue_t *q, struct request *rq) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index b3780ca0fdc0..3ba6430899df 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -243,6 +243,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) blk_queue_hardsect_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); + q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ @@ -1511,8 +1512,10 @@ request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) /* * all done */ - if (!elevator_init(q, NULL)) + if (!elevator_init(q, NULL)) { + blk_queue_congestion_threshold(q); return q; + } blk_cleanup_queue(q); out_init: @@ -1540,13 +1543,20 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq) mempool_free(rq, q->rq.rq_pool); } -static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) +static inline struct request *blk_alloc_request(request_queue_t *q, int rw, + int gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); if (!rq) return NULL; + /* + * first three bits are identical 
in rq->flags and bio->bi_rw, + * see bio.h and blkdev.h + */ + rq->flags = rw; + if (!elv_set_request(q, rq, gfp_mask)) return rq; @@ -1558,7 +1568,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. */ -static inline int ioc_batching(struct io_context *ioc) +static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) { if (!ioc) return 0; @@ -1568,7 +1578,7 @@ static inline int ioc_batching(struct io_context *ioc) * even if the batch times out, otherwise we could theoretically * lose wakeups. */ - return ioc->nr_batch_requests == BLK_BATCH_REQ || + return ioc->nr_batch_requests == q->nr_batching || (ioc->nr_batch_requests > 0 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); } @@ -1579,12 +1589,12 @@ static inline int ioc_batching(struct io_context *ioc) * is the behaviour we want though - once it gets a wakeup it should be given * a nice run. */ -void ioc_set_batching(struct io_context *ioc) +void ioc_set_batching(request_queue_t *q, struct io_context *ioc) { - if (!ioc || ioc_batching(ioc)) + if (!ioc || ioc_batching(q, ioc)) return; - ioc->nr_batch_requests = BLK_BATCH_REQ; + ioc->nr_batch_requests = q->nr_batching; ioc->last_waited = jiffies; } @@ -1600,10 +1610,10 @@ static void freed_request(request_queue_t *q, int rw) if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); if (rl->count[rw]+1 <= q->nr_requests) { + smp_mb(); if (waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); - if (!waitqueue_active(&rl->wait[rw])) - blk_clear_queue_full(q, rw); + blk_clear_queue_full(q, rw); } if (unlikely(waitqueue_active(&rl->drain)) && !rl->count[READ] && !rl->count[WRITE]) @@ -1632,13 +1642,22 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) * will be blocked. */ if (!blk_queue_full(q, rw)) { - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); blk_set_queue_full(q, rw); } } - if (blk_queue_full(q, rw) - && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { + switch (elv_may_queue(q, rw)) { + case ELV_MQUEUE_NO: + spin_unlock_irq(q->queue_lock); + goto out; + case ELV_MQUEUE_MAY: + break; + case ELV_MQUEUE_MUST: + goto get_rq; + } + + if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { /* * The queue is full and the allocating process is not a * "batcher", and not exempted by the IO scheduler @@ -1647,12 +1666,13 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) goto out; } +get_rq: rl->count[rw]++; if (rl->count[rw] >= queue_congestion_on_threshold(q)) set_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, gfp_mask); + rq = blk_alloc_request(q, rw, gfp_mask); if (!rq) { /* * Allocation failed presumably due to memory. 
Undo anything @@ -1667,17 +1687,11 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) goto out; } - if (ioc_batching(ioc)) + if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; INIT_LIST_HEAD(&rq->queuelist); - /* - * first three bits are identical in rq->flags and bio->bi_rw, - * see bio.h and blkdev.h - */ - rq->flags = rw; - rq->errors = 0; rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; @@ -1726,7 +1740,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) * See ioc_batching, ioc_set_batching */ ioc = get_io_context(GFP_NOIO); - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); put_io_context(ioc); } finish_wait(&rl->wait[rw], &wait); @@ -3082,6 +3096,9 @@ void put_io_context(struct io_context *ioc) if (atomic_dec_and_test(&ioc->refcount)) { if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); + if (ioc->cic && ioc->cic->dtor) + ioc->cic->dtor(ioc->cic); + kmem_cache_free(iocontext_cachep, ioc); } } @@ -3095,14 +3112,15 @@ void exit_io_context(void) local_irq_save(flags); ioc = current->io_context; - if (ioc) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - put_io_context(ioc); - current->io_context = NULL; - } else - WARN_ON(1); + current->io_context = NULL; local_irq_restore(flags); + + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + if (ioc->cic && ioc->cic->exit) + ioc->cic->exit(ioc->cic); + + put_io_context(ioc); } /* @@ -3121,20 +3139,39 @@ struct io_context *get_io_context(int gfp_flags) local_irq_save(flags); ret = tsk->io_context; - if (ret == NULL) { - ret = kmem_cache_alloc(iocontext_cachep, GFP_ATOMIC); - if (ret) { - atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; + if (ret) + goto out; + + local_irq_restore(flags); + + ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ + ret->aic = NULL; + ret->cic = NULL; + spin_lock_init(&ret->lock); + + local_irq_save(flags); + + /* + * very unlikely, someone raced with us in setting up the task + * io context. free new context and just grab a reference. + */ + if (!tsk->io_context) tsk->io_context = ret; + else { + kmem_cache_free(iocontext_cachep, ret); + ret = tsk->io_context; } - } - if (ret) + +out: atomic_inc(&ret->refcount); - local_irq_restore(flags); + local_irq_restore(flags); + } + return ret; } EXPORT_SYMBOL(get_io_context); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5e4a6ab84ecb..b2059869cb92 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -52,6 +52,20 @@ struct as_io_context { sector_t seek_mean; }; +struct cfq_queue; +struct cfq_io_context { + void (*dtor)(struct cfq_io_context *); + void (*exit)(struct cfq_io_context *); + + struct io_context *ioc; + + /* + * circular list of cfq_io_contexts belonging to a process io context + */ + struct list_head list; + struct cfq_queue *cfqq; +}; + /* * This is the per-process I/O subsystem state. It is refcounted and * kmalloc'ed. 
Currently all fields are modified in process io context @@ -67,7 +81,10 @@ struct io_context { unsigned long last_waited; /* Time last woken after wait for request */ int nr_batch_requests; /* Number of requests left in the batch */ + spinlock_t lock; + struct as_io_context *aic; + struct cfq_io_context *cic; }; void put_io_context(struct io_context *ioc); @@ -343,6 +360,7 @@ struct request_queue unsigned long nr_requests; /* Max # of requests */ unsigned int nr_congestion_on; unsigned int nr_congestion_off; + unsigned int nr_batching; unsigned short max_sectors; unsigned short max_hw_sectors; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 95cdfb5bb790..8cf0e3f290bf 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -130,4 +130,13 @@ extern int elv_try_last_merge(request_queue_t *, struct bio *); #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 +/* + * return values from elevator_may_queue_fn + */ +enum { + ELV_MQUEUE_MAY, + ELV_MQUEUE_NO, + ELV_MQUEUE_MUST, +}; + #endif -- cgit v1.2.3 From d55249d351bc96f49e30bc3e5dfa1dad5034cc28 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Oct 2004 18:01:53 -0700 Subject: [PATCH] convert jiffies <-> msecs for io schedulers The various io schedulers don't convert to and from jiffies and ms in their sysfs exported values. This patch adds that. Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/as-iosched.c | 7 ++--- drivers/block/cfq-iosched.c | 58 +++++++++++++++++++++++----------------- drivers/block/deadline-iosched.c | 46 ++++++++++++++++++------------- 3 files changed, 64 insertions(+), 47 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index b049848c19c3..0aa3ee8c309b 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -1962,10 +1962,10 @@ static ssize_t as_est_show(struct as_data *ad, char *page) return pos; } -#define SHOW_FUNCTION(__FUNC, __VAR) \ +#define SHOW_FUNCTION(__FUNC, __VAR) \ static ssize_t __FUNC(struct as_data *ad, char *page) \ -{ \ - return as_var_show(__VAR, (page)); \ +{ \ + return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ } SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]); SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]); @@ -1982,6 +1982,7 @@ static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \ *(__PTR) = (MIN); \ else if (*(__PTR) > (MAX)) \ *(__PTR) = (MAX); \ + *(__PTR) = msecs_to_jiffies(*(__PTR)); \ return ret; \ } STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 738ff90bb2c1..cf7fc7609e67 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -1732,39 +1732,47 @@ cfq_status_show(struct cfq_data *cfqd, char *page) return len; } -#define SHOW_FUNCTION(__FUNC, __VAR) \ +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ - return cfq_var_show(__VAR, (page)); \ -} -SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); -SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); -SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r); -SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w); -SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire); -SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq); -SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max); 
-SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty); + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return cfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); +SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); +SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1); +SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1); +SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1); +SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0); +SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); +SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0); #undef SHOW_FUNCTION -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ { \ - int ret = cfq_var_store(__PTR, (page), count); \ - if (*(__PTR) < (MIN)) \ - *(__PTR) = (MIN); \ - else if (*(__PTR) > (MAX)) \ - *(__PTR) = (MAX); \ + unsigned int __data; \ + int ret = cfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ return ret; \ } -STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX); -STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX); -STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX); -STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX); -STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX); -STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1); -STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX); -STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX); +STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0); +STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); +STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index 0d3e2411f1d3..f482e8bdb4d6 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -805,33 +805,41 @@ deadline_var_store(unsigned int *var, const char *page, size_t count) return count; } -#define SHOW_FUNCTION(__FUNC, __VAR) \ +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct deadline_data *dd, char *page) \ { \ - return deadline_var_show(__VAR, (page)); \ -} -SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ]); -SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE]); -SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved); -SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges); -SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch); + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); 
\
+	return deadline_var_show(__data, (page));			\
+}
+SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0);
 #undef SHOW_FUNCTION
 
-#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
 static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count)	\
 {									\
-	int ret = deadline_var_store(__PTR, (page), count);		\
-	if (*(__PTR) < (MIN))						\
-		*(__PTR) = (MIN);					\
-	else if (*(__PTR) > (MAX))					\
-		*(__PTR) = (MAX);					\
+	unsigned int __data;						\
+	int ret = deadline_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	if (__CONV)							\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else								\
+		*(__PTR) = __data;					\
 	return ret;							\
 }
-STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX);
-STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX);
-STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX);
-STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1);
-STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX);
+STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
 static struct deadline_fs_entry deadline_readexpire_entry = {
-- 
cgit v1.2.3


From 3e57f72ee2c5cd45515c3e17f6db8a49a1e56a0a Mon Sep 17 00:00:00 2001
From: Maximilian Attems
Date: Mon, 18 Oct 2004 18:18:46 -0700
Subject: [PATCH] janitor: cpqarray remove unused include

remove unused #include <linux/version.h>
Old ifdefs were removed that used its definition.

Signed-off-by: Maximilian Attems
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/block/cpqarray.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 204b3182900d..dc896a12283b 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -21,7 +21,6 @@
  */
 #include <linux/config.h>	/* CONFIG_PROC_FS */
 #include
-#include <linux/version.h>
 #include
 #include
 #include
@@ -732,7 +731,6 @@ static void __iomem *remap_pci_mem(ulong base, ulong size)
 }
 
 #ifndef MODULE
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,13)
 /*
  * Config string is a comma separated set of i/o addresses of EISA cards.
  */
@@ -749,18 +747,6 @@ static int cpqarray_setup(char *str)
 
 __setup("smart2=", cpqarray_setup);
 
-#else
-
-/*
- * Copy the contents of the ints[] array passed to us by init.
- */
-void cpqarray_setup(char *str, int *ints)
-{
-	int i;
-	for(i=0; i
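For readers who want to see the jiffies/milliseconds round-trip from the io scheduler
sysfs patches above in isolation, here is a minimal user-space sketch. It is
illustrative only: HZ is pinned to 1000, the two conversion helpers are simplified
stand-ins for the kernel's versions, and demo_tunables/demo_store/demo_show are
made-up names, not code from these patches.

/*
 * Illustrative, user-space sketch of the jiffies <-> milliseconds
 * round-trip the sysfs show/store macros above perform. HZ and the
 * two helpers are stand-ins for the kernel definitions; the tunable
 * struct is hypothetical.
 */
#include <stdio.h>

#define HZ 1000				/* assume a 1000 Hz tick for the demo */

static unsigned int msecs_to_jiffies(unsigned int ms)
{
	return (ms * HZ + 999) / 1000;	/* round up, as the kernel does */
}

static unsigned int jiffies_to_msecs(unsigned int j)
{
	return j * (1000 / HZ);		/* exact when HZ divides 1000 */
}

struct demo_tunables {
	unsigned int fifo_expire;	/* kept internally in jiffies */
};

/* store path: user writes milliseconds, we clamp and convert to jiffies */
static void demo_store(struct demo_tunables *t, unsigned int ms)
{
	if (ms < 1)
		ms = 1;
	t->fifo_expire = msecs_to_jiffies(ms);
}

/* show path: convert back to milliseconds for the user */
static unsigned int demo_show(const struct demo_tunables *t)
{
	return jiffies_to_msecs(t->fifo_expire);
}

int main(void)
{
	struct demo_tunables t;

	demo_store(&t, 250);
	printf("stored %u jiffies, shown as %u ms\n",
	       t.fifo_expire, demo_show(&t));
	return 0;
}

The point of the kernel patches is exactly this split: values cross the sysfs
boundary in milliseconds, while the schedulers keep working in jiffies internally.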