| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-10-18 20:50:22 -0700 |
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-10-18 20:50:22 -0700 |
| commit | 098fc560ef2bbd1bde80845c898fa95db616eb6c (patch) | |
| tree | ca722c6fdbdffe9b7cfd31d61e8f4aae906a319c /drivers/block | |
| parent | bffe01870598b7a0a77073e25ee94e026bc98e6b (diff) | |
| parent | 2a136606fe21b603a0ce484fc578f862f8e8384d (diff) | |
Trivial Makefile merge
Diffstat (limited to 'drivers/block')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/block/Kconfig | 33 |
| -rw-r--r-- | drivers/block/Kconfig.iosched | 8 |
| -rw-r--r-- | drivers/block/Makefile | 1 |
| -rw-r--r-- | drivers/block/as-iosched.c | 122 |
| -rw-r--r-- | drivers/block/cfq-iosched.c | 1574 |
| -rw-r--r-- | drivers/block/cpqarray.c | 14 |
| -rw-r--r-- | drivers/block/deadline-iosched.c | 136 |
| -rw-r--r-- | drivers/block/elevator.c | 318 |
| -rw-r--r-- | drivers/block/ll_rw_blk.c | 253 |
| -rw-r--r-- | drivers/block/noop-iosched.c | 33 |
| -rw-r--r-- | drivers/block/pktcdvd.c | 2679 |
| -rw-r--r-- | drivers/block/ub.c | 165 |
12 files changed, 4735 insertions, 601 deletions
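Beyond the new pktcdvd driver, most of the churn above comes from converting the I/O schedulers from statically linked `elevator_t` instances into loadable `struct elevator_type` modules registered through `elv_register()`, which is also why the Kconfig entries below switch from `bool` to `tristate`. As a rough orientation before the raw diff, the sketch below condenses that registration pattern; it is an illustrative skeleton of the 2.6-era API as it appears in the anticipatory and CFQ hunks of this merge, not code taken from the patch, and the "example" name and empty ops table are placeholders.

```c
/*
 * Minimal sketch (not from the patch) of the module-registration pattern
 * this merge moves the schedulers to.  A real scheduler fills in the
 * elevator_*_fn hooks in .ops, as the anticipatory and CFQ hunks do.
 */
#include <linux/module.h>
#include <linux/elevator.h>

static struct elevator_type iosched_example = {
	.ops = {
		/* .elevator_merge_fn, .elevator_init_fn, ... go here */
	},
	.elevator_name	= "example",	/* placeholder name */
	.elevator_owner	= THIS_MODULE,
};

static int __init example_init(void)
{
	/* makes the scheduler known to the block layer by name */
	return elv_register(&iosched_example);
}

static void __exit example_exit(void)
{
	elv_unregister(&iosched_example);
}

module_init(example_init);
module_exit(example_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("skeleton I/O scheduler registration example");
```

With this pattern in place, AS, deadline, CFQ and noop can be built as modules and selected per queue at runtime instead of being compiled-in singletons, which is what the diff below implements.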
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index a1d50242b8cd..6a43c807497d 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -356,6 +356,39 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB. Otherwise say N.
 
+config CDROM_PKTCDVD
+	tristate "Packet writing on CD/DVD media"
+	help
+	  If you have a CDROM drive that supports packet writing, say Y to
+	  include preliminary support. It should work with any MMC/Mt Fuji
+	  compliant ATAPI or SCSI drive, which is just about any newer CD
+	  writer.
+
+	  Currently only writing to CD-RW, DVD-RW and DVD+RW discs is possible.
+	  DVD-RW disks must be in restricted overwrite mode.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called pktcdvd.
+
+config CDROM_PKTCDVD_BUFFERS
+	int "Free buffers for data gathering"
+	depends on CDROM_PKTCDVD
+	default "8"
+	help
+	  This controls the maximum number of active concurrent packets. More
+	  concurrent packets can increase write performance, but also require
+	  more memory. Each concurrent packet will require approximately 64Kb
+	  of non-swappable kernel memory, memory which will be allocated at
+	  pktsetup time.
+
+config CDROM_PKTCDVD_WCACHE
+	bool "Enable write caching"
+	depends on CDROM_PKTCDVD
+	help
+	  If enabled, write caching will be set for the CD-R/W device. For now
+	  this option is dangerous unless the CD-RW media is known good, as we
+	  don't do deferred write error handling yet.
+
 source "drivers/s390/block/Kconfig"
 
 endmenu
diff --git a/drivers/block/Kconfig.iosched b/drivers/block/Kconfig.iosched
index d938c5fd130b..e0ba6c93717e 100644
--- a/drivers/block/Kconfig.iosched
+++ b/drivers/block/Kconfig.iosched
@@ -1,5 +1,5 @@
 config IOSCHED_NOOP
-	bool "No-op I/O scheduler" if EMBEDDED
+	bool
 	default y
 	---help---
 	  The no-op I/O scheduler is a minimal scheduler that does basic merging
@@ -9,7 +9,7 @@ config IOSCHED_NOOP
 	  the kernel.
 
 config IOSCHED_AS
-	bool "Anticipatory I/O scheduler" if EMBEDDED
+	tristate "Anticipatory I/O scheduler"
 	default y
 	---help---
 	  The anticipatory I/O scheduler is the default disk scheduler. It is
@@ -18,7 +18,7 @@ config IOSCHED_AS
 	  slower in some cases especially some database loads.
 
 config IOSCHED_DEADLINE
-	bool "Deadline I/O scheduler" if EMBEDDED
+	tristate "Deadline I/O scheduler"
 	default y
 	---help---
 	  The deadline I/O scheduler is simple and compact, and is often as
@@ -28,7 +28,7 @@ config IOSCHED_DEADLINE
 	  anticipatory I/O scheduler and so is a good choice.
 
config IOSCHED_CFQ - bool "CFQ I/O scheduler" if EMBEDDED + tristate "CFQ I/O scheduler" default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally diff --git a/drivers/block/Makefile b/drivers/block/Makefile index c8fbbf14ce94..1cf09a1c065b 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_BLK_DEV_XD) += xd.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o +obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 0ef6a665d93e..0aa3ee8c309b 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -614,7 +614,7 @@ static void as_antic_stop(struct as_data *ad) static void as_antic_timeout(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); @@ -945,7 +945,7 @@ static void update_write_batch(struct as_data *ad) */ static void as_completed_request(request_queue_t *q, struct request *rq) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); WARN_ON(!list_empty(&rq->queuelist)); @@ -1030,7 +1030,7 @@ static void as_remove_queued_request(request_queue_t *q, struct request *rq) { struct as_rq *arq = RQ_DATA(rq); const int data_dir = arq->is_sync; - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; WARN_ON(arq->state != AS_RQ_QUEUED); @@ -1361,7 +1361,7 @@ fifo_expired: static struct request *as_next_request(request_queue_t *q) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct request *rq = NULL; /* @@ -1469,7 +1469,7 @@ static void as_add_request(struct as_data *ad, struct as_rq *arq) */ static void as_requeue_request(request_queue_t *q, struct request *rq) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); if (arq) { @@ -1509,7 +1509,7 @@ static void as_account_queued_request(struct as_data *ad, struct request *rq) static void as_insert_request(request_queue_t *q, struct request *rq, int where) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); if (arq) { @@ -1562,7 +1562,7 @@ as_insert_request(request_queue_t *q, struct request *rq, int where) */ static int as_queue_empty(request_queue_t *q) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; if (!list_empty(&ad->fifo_list[REQ_ASYNC]) || !list_empty(&ad->fifo_list[REQ_SYNC]) @@ -1601,7 +1601,7 @@ as_latter_request(request_queue_t *q, struct request *rq) static int as_merge(request_queue_t *q, struct request **req, struct bio *bio) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; sector_t rb_key = bio->bi_sector + bio_sectors(bio); struct request *__rq; int ret; @@ -1656,7 +1656,7 @@ out_insert: static void as_merged_request(request_queue_t *q, struct request *req) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(req); /* @@ -1701,7 
+1701,7 @@ static void as_merged_requests(request_queue_t *q, struct request *req, struct request *next) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(req); struct as_rq *anext = RQ_DATA(next); @@ -1788,7 +1788,7 @@ static void as_work_handler(void *data) static void as_put_request(request_queue_t *q, struct request *rq) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = RQ_DATA(rq); if (!arq) { @@ -1807,7 +1807,7 @@ static void as_put_request(request_queue_t *q, struct request *rq) static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - struct as_data *ad = q->elevator.elevator_data; + struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); if (arq) { @@ -1828,21 +1828,21 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) static int as_may_queue(request_queue_t *q, int rw) { - int ret = 0; - struct as_data *ad = q->elevator.elevator_data; + int ret = ELV_MQUEUE_MAY; + struct as_data *ad = q->elevator->elevator_data; struct io_context *ioc; if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { ioc = as_get_io_context(); if (ad->io_context == ioc) - ret = 1; + ret = ELV_MQUEUE_MUST; put_io_context(ioc); } return ret; } -static void as_exit(request_queue_t *q, elevator_t *e) +static void as_exit_queue(elevator_t *e) { struct as_data *ad = e->elevator_data; @@ -1862,7 +1862,7 @@ static void as_exit(request_queue_t *q, elevator_t *e) * initialize elevator private data (as_data), and alloc a arq for * each request on the free lists */ -static int as_init(request_queue_t *q, elevator_t *e) +static int as_init_queue(request_queue_t *q, elevator_t *e) { struct as_data *ad; int i; @@ -1962,10 +1962,10 @@ static ssize_t as_est_show(struct as_data *ad, char *page) return pos; } -#define SHOW_FUNCTION(__FUNC, __VAR) \ +#define SHOW_FUNCTION(__FUNC, __VAR) \ static ssize_t __FUNC(struct as_data *ad, char *page) \ -{ \ - return as_var_show(__VAR, (page)); \ +{ \ + return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ } SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]); SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]); @@ -1982,6 +1982,7 @@ static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \ *(__PTR) = (MIN); \ else if (*(__PTR) > (MAX)) \ *(__PTR) = (MAX); \ + *(__PTR) = msecs_to_jiffies(*(__PTR)); \ return ret; \ } STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); @@ -2070,39 +2071,64 @@ static struct kobj_type as_ktype = { .default_attrs = default_attrs, }; -static int __init as_slab_setup(void) +static struct elevator_type iosched_as = { + .ops = { + .elevator_merge_fn = as_merge, + .elevator_merged_fn = as_merged_request, + .elevator_merge_req_fn = as_merged_requests, + .elevator_next_req_fn = as_next_request, + .elevator_add_req_fn = as_insert_request, + .elevator_remove_req_fn = as_remove_request, + .elevator_requeue_req_fn = as_requeue_request, + .elevator_queue_empty_fn = as_queue_empty, + .elevator_completed_req_fn = as_completed_request, + .elevator_former_req_fn = as_former_request, + .elevator_latter_req_fn = as_latter_request, + .elevator_set_req_fn = as_set_request, + .elevator_put_req_fn = as_put_request, + .elevator_may_queue_fn = as_may_queue, + .elevator_init_fn = as_init_queue, + .elevator_exit_fn = as_exit_queue, + 
}, + + .elevator_ktype = &as_ktype, + .elevator_name = "anticipatory", + .elevator_owner = THIS_MODULE, +}; + +int as_init(void) { + int ret; + arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq), 0, 0, NULL, NULL); - if (!arq_pool) - panic("as: can't init slab pool\n"); + return -ENOMEM; - return 0; + ret = elv_register(&iosched_as); + if (!ret) { + /* + * don't allow AS to get unregistered, since we would have + * to browse all tasks in the system and release their + * as_io_context first + */ + __module_get(THIS_MODULE); + return 0; + } + + kmem_cache_destroy(arq_pool); + return ret; } -subsys_initcall(as_slab_setup); - -elevator_t iosched_as = { - .elevator_merge_fn = as_merge, - .elevator_merged_fn = as_merged_request, - .elevator_merge_req_fn = as_merged_requests, - .elevator_next_req_fn = as_next_request, - .elevator_add_req_fn = as_insert_request, - .elevator_remove_req_fn = as_remove_request, - .elevator_requeue_req_fn = as_requeue_request, - .elevator_queue_empty_fn = as_queue_empty, - .elevator_completed_req_fn = as_completed_request, - .elevator_former_req_fn = as_former_request, - .elevator_latter_req_fn = as_latter_request, - .elevator_set_req_fn = as_set_request, - .elevator_put_req_fn = as_put_request, - .elevator_may_queue_fn = as_may_queue, - .elevator_init_fn = as_init, - .elevator_exit_fn = as_exit, - - .elevator_ktype = &as_ktype, - .elevator_name = "anticipatory", -}; +void as_exit(void) +{ + kmem_cache_destroy(arq_pool); + elv_unregister(&iosched_as); +} + +module_init(as_init); +module_exit(as_exit); -EXPORT_SYMBOL(iosched_as); +MODULE_AUTHOR("Nick Piggin"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("anticipatory IO scheduler"); diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 068f4eae0b5c..cf7fc7609e67 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -22,96 +22,216 @@ #include <linux/rbtree.h> #include <linux/mempool.h> +static unsigned long max_elapsed_crq; +static unsigned long max_elapsed_dispatch; + /* * tunables */ -static int cfq_quantum = 4; -static int cfq_queued = 8; +static int cfq_quantum = 4; /* max queue in one round of service */ +static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ +static int cfq_service = HZ; /* period over which service is avg */ +static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */ +static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */ +static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */ +static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ +static int cfq_back_penalty = 2; /* penalty of a backwards seek */ +/* + * for the hash of cfqq inside the cfqd + */ #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash) +#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) -#define CFQ_MHASH_SHIFT 8 +/* + * for the hash of crq inside the cfqq + */ +#define CFQ_MHASH_SHIFT 6 #define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) #define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) -#define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT)) -#define ON_MHASH(crq) !list_empty(&(crq)->hash) +#define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT) #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash) +#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) 
#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) -#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private) +#define RQ_DATA(rq) (rq)->elevator_private + +/* + * rb-tree defines + */ +#define RB_NONE (2) +#define RB_EMPTY(node) ((node)->rb_node == NULL) +#define RB_CLEAR_COLOR(node) (node)->rb_color = RB_NONE +#define RB_CLEAR(node) do { \ + (node)->rb_parent = NULL; \ + RB_CLEAR_COLOR((node)); \ + (node)->rb_right = NULL; \ + (node)->rb_left = NULL; \ +} while (0) +#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) +#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) +#define rq_rb_key(rq) (rq)->sector + +/* + * threshold for switching off non-tag accounting + */ +#define CFQ_MAX_TAG (4) + +/* + * sort key types and names + */ +enum { + CFQ_KEY_PGID, + CFQ_KEY_TGID, + CFQ_KEY_UID, + CFQ_KEY_GID, + CFQ_KEY_LAST, +}; + +static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL }; + +/* + * spare queue + */ +#define CFQ_KEY_SPARE (~0UL) static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; -static mempool_t *cfq_mpool; +static kmem_cache_t *cfq_ioc_pool; struct cfq_data { struct list_head rr_list; - struct list_head *dispatch; - struct list_head *cfq_hash; + struct list_head empty_list; - struct list_head *crq_hash; + struct hlist_head *cfq_hash; + struct hlist_head *crq_hash; + /* queues on rr_list (ie they have pending requests */ unsigned int busy_queues; + unsigned int max_queued; + atomic_t ref; + + int key_type; + mempool_t *crq_pool; request_queue_t *queue; + sector_t last_sector; + + int rq_in_driver; + /* - * tunables + * tunables, see top of file */ unsigned int cfq_quantum; unsigned int cfq_queued; + unsigned int cfq_fifo_expire_r; + unsigned int cfq_fifo_expire_w; + unsigned int cfq_fifo_batch_expire; + unsigned int cfq_back_penalty; + unsigned int cfq_back_max; + unsigned int find_best_crq; + + unsigned int cfq_tagged; }; struct cfq_queue { - struct list_head cfq_hash; + /* reference count */ + atomic_t ref; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* hash of mergeable requests */ + struct hlist_node cfq_hash; + /* hash key */ + unsigned long key; + /* whether queue is on rr (or empty) list */ + int on_rr; + /* on either rr or empty list of cfqd */ struct list_head cfq_list; + /* sorted list of pending requests */ struct rb_root sort_list; - int pid; + /* if fifo isn't expired, next request to serve */ + struct cfq_rq *next_crq; + /* requests queued in sort_list */ int queued[2]; -#if 0 - /* - * with a simple addition like this, we can do io priorities. almost. - * does need a split request free list, too. 
- */ - int io_prio -#endif + /* currently allocated requests */ + int allocated[2]; + /* fifo list of requests in sort_list */ + struct list_head fifo[2]; + /* last time fifo expired */ + unsigned long last_fifo_expire; + + int key_type; + + unsigned long service_start; + unsigned long service_used; + + unsigned int max_rate; + + /* number of requests that have been handed to the driver */ + int in_flight; + /* number of currently allocated requests */ + int alloc_limit[2]; }; struct cfq_rq { struct rb_node rb_node; sector_t rb_key; - struct request *request; + struct hlist_node hash; struct cfq_queue *cfq_queue; + struct cfq_io_context *io_context; + + unsigned long service_start; + unsigned long queue_start; - struct list_head hash; + unsigned int in_flight : 1; + unsigned int accounted : 1; + unsigned int is_sync : 1; + unsigned int is_write : 1; }; -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq); +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); +static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); +static void cfq_update_next_crq(struct cfq_rq *); +static void cfq_put_cfqd(struct cfq_data *cfqd); /* - * lots of deadline iosched dupes, can be abstracted later... + * what the fairness is based on (ie how processes are grouped and + * differentiated) */ -static inline void __cfq_del_crq_hash(struct cfq_rq *crq) +static inline unsigned long +cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) { - list_del_init(&crq->hash); + /* + * optimize this so that ->key_type is the offset into the struct + */ + switch (cfqd->key_type) { + case CFQ_KEY_PGID: + return process_group(tsk); + default: + case CFQ_KEY_TGID: + return tsk->tgid; + case CFQ_KEY_UID: + return tsk->uid; + case CFQ_KEY_GID: + return tsk->gid; + } } +/* + * lots of deadline iosched dupes, can be abstracted later... 
+ */ static inline void cfq_del_crq_hash(struct cfq_rq *crq) { - if (ON_MHASH(crq)) - __cfq_del_crq_hash(crq); + hlist_del_init(&crq->hash); } static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) @@ -120,32 +240,32 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) if (q->last_merge == crq->request) q->last_merge = NULL; + + cfq_update_next_crq(crq); } static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct request *rq = crq->request; + const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - BUG_ON(ON_MHASH(crq)); + BUG_ON(!hlist_unhashed(&crq->hash)); - list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); + hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); } static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) { - struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct list_head *entry, *next = hash_list->next; + struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; + struct hlist_node *entry, *next; - while ((entry = next) != hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - next = entry->next; - - BUG_ON(!ON_MHASH(crq)); + BUG_ON(hlist_unhashed(&crq->hash)); if (!rq_mergeable(__rq)) { - __cfq_del_crq_hash(crq); + cfq_del_crq_hash(crq); continue; } @@ -157,29 +277,257 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) } /* - * rb tree support functions + * Lifted from AS - choose which of crq1 and crq2 that is best served now. + * We choose the request that is closest to the head right now. Distance + * behind the head are penalized and only allowed to a certain extent. */ -#define RB_NONE (2) -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) -#define ON_RB(node) ((node)->rb_color != RB_NONE) -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector +static struct cfq_rq * +cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) +{ + sector_t last, s1, s2, d1 = 0, d2 = 0; + int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */ + unsigned long back_max; + + if (crq1 == NULL || crq1 == crq2) + return crq2; + if (crq2 == NULL) + return crq1; -static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) + s1 = crq1->request->sector; + s2 = crq2->request->sector; + + last = cfqd->last_sector; + +#if 0 + if (!list_empty(&cfqd->queue->queue_head)) { + struct list_head *entry = &cfqd->queue->queue_head; + unsigned long distance = ~0UL; + struct request *rq; + + while ((entry = entry->prev) != &cfqd->queue->queue_head) { + rq = list_entry_rq(entry); + + if (blk_barrier_rq(rq)) + break; + + if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { + distance = abs(s1 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { + distance = abs(s2 - rq->sector +rq->nr_sectors); + last = rq->sector + rq->nr_sectors; + } + } + } +#endif + + /* + * by definition, 1KiB is 2 sectors + */ + back_max = cfqd->cfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * cfqd->cfq_back_penalty; + else + r1_wrap = 1; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * cfqd->cfq_back_penalty; + else + r2_wrap = 1; + + /* Found required data */ + if (!r1_wrap && r2_wrap) + return crq1; + else if (!r2_wrap && r1_wrap) + return crq2; + else if (r1_wrap && r2_wrap) { + /* both behind the head */ + if (s1 <= s2) + return crq1; + else + return crq2; + } + + /* Both requests in front of the head */ + if (d1 < d2) + return crq1; + else if (d2 < d1) + return crq2; + else { + if (s1 >= s2) + return crq1; + else + return crq2; + } +} + +/* + * would be nice to take fifo expire time into account as well + */ +static struct cfq_rq * +cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *last) +{ + struct cfq_rq *crq_next = NULL, *crq_prev = NULL; + struct rb_node *rbnext, *rbprev; + + if (!ON_RB(&last->rb_node)) + return NULL; + + if ((rbnext = rb_next(&last->rb_node)) == NULL) + rbnext = rb_first(&cfqq->sort_list); + + rbprev = rb_prev(&last->rb_node); + + if (rbprev) + crq_prev = rb_entry_crq(rbprev); + if (rbnext) + crq_next = rb_entry_crq(rbnext); + + return cfq_choose_req(cfqd, crq_next, crq_prev); +} + +static void cfq_update_next_crq(struct cfq_rq *crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + + if (cfqq->next_crq == crq) + cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); +} + +static int cfq_check_sort_rr_list(struct cfq_queue *cfqq) +{ + struct list_head *head = &cfqq->cfqd->rr_list; + struct list_head *next, *prev; + + /* + * list might still be ordered + */ + next = cfqq->cfq_list.next; + if (next != head) { + struct cfq_queue *cnext = list_entry_cfqq(next); + + if (cfqq->service_used > cnext->service_used) + return 1; + } + + prev = cfqq->cfq_list.prev; + if (prev != head) { + struct cfq_queue *cprev = list_entry_cfqq(prev); + + if (cfqq->service_used < cprev->service_used) + return 1; + } + + return 0; +} + +static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) +{ + struct list_head *entry = &cfqq->cfqd->rr_list; + + if (!cfqq->on_rr) + return; + if (!new_queue && !cfq_check_sort_rr_list(cfqq)) + return; + + list_del(&cfqq->cfq_list); + + /* + * sort by our mean service_used, sub-sort by in-flight requests + */ + while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { + struct cfq_queue *__cfqq = list_entry_cfqq(entry); + + if (cfqq->service_used > __cfqq->service_used) + break; + else if (cfqq->service_used == __cfqq->service_used) { + struct list_head *prv; + + while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { + __cfqq = list_entry_cfqq(prv); + + WARN_ON(__cfqq->service_used > cfqq->service_used); + if (cfqq->service_used != __cfqq->service_used) + break; + if (cfqq->in_flight > __cfqq->in_flight) + break; + + entry = prv; + } + } + } + + list_add(&cfqq->cfq_list, entry); +} + +/* + * add to busy list of queues for service, trying to be fair in ordering + * the pending list according to requests serviced + */ +static inline void +cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + /* + * it's currently on the empty list + */ + cfqq->on_rr = 1; + cfqd->busy_queues++; + + if (time_after(jiffies, cfqq->service_start + cfq_service)) + cfqq->service_used >>= 3; + + cfq_sort_rr_list(cfqq, 1); +} + +static inline void +cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + list_move(&cfqq->cfq_list, &cfqd->empty_list); + cfqq->on_rr = 0; + 
+ BUG_ON(!cfqd->busy_queues); + cfqd->busy_queues--; +} + +/* + * rb tree support functions + */ +static inline void cfq_del_crq_rb(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + if (ON_RB(&crq->rb_node)) { - cfqq->queued[rq_data_dir(crq->request)]--; + struct cfq_data *cfqd = cfqq->cfqd; + + BUG_ON(!cfqq->queued[crq->is_sync]); + + cfq_update_next_crq(crq); + + cfqq->queued[crq->is_sync]--; rb_erase(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = NULL; + RB_CLEAR_COLOR(&crq->rb_node); + + if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) + cfq_del_cfqq_rr(cfqd, cfqq); } } static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +__cfq_add_crq_rb(struct cfq_rq *crq) { - struct rb_node **p = &cfqq->sort_list.rb_node; + struct rb_node **p = &crq->cfq_queue->sort_list.rb_node; struct rb_node *parent = NULL; struct cfq_rq *__crq; @@ -199,30 +547,50 @@ __cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) return NULL; } -static void -cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) +static void cfq_add_crq_rb(struct cfq_rq *crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_data *cfqd = cfqq->cfqd; struct request *rq = crq->request; struct cfq_rq *__alias; crq->rb_key = rq_rb_key(rq); - cfqq->queued[rq_data_dir(rq)]++; -retry: - __alias = __cfq_add_crq_rb(cfqq, crq); - if (!__alias) { - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = cfqq; - return; + cfqq->queued[crq->is_sync]++; + + /* + * looks a little odd, but the first insert might return an alias. + * if that happens, put the alias on the dispatch list + */ + while ((__alias = __cfq_add_crq_rb(crq)) != NULL) + cfq_dispatch_sort(cfqd->queue, __alias); + + rb_insert_color(&crq->rb_node, &cfqq->sort_list); + + if (!cfqq->on_rr) + cfq_add_cfqq_rr(cfqd, cfqq); + + /* + * check if this request is a better next-serve candidate + */ + cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); +} + +static inline void +cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + if (ON_RB(&crq->rb_node)) { + rb_erase(&crq->rb_node, &cfqq->sort_list); + cfqq->queued[crq->is_sync]--; } - cfq_dispatch_sort(cfqd, cfqq, __alias); - goto retry; + cfq_add_crq_rb(crq); } static struct request * cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + const unsigned long key = cfq_hash_key(cfqd, current); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); struct rb_node *n; if (!cfqq) @@ -244,30 +612,44 @@ out: return NULL; } -static void cfq_remove_request(request_queue_t *q, struct request *rq) +/* + * make sure the service time gets corrected on reissue of this request + */ +static void cfq_requeue_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_rq *crq = RQ_DATA(rq); if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; + if (cfqq->cfqd->cfq_tagged) { + cfqq->service_used--; + cfq_sort_rr_list(cfqq, 0); + } + + crq->accounted = 0; + cfqq->cfqd->rq_in_driver--; + } + list_add(&rq->queuelist, &q->queue_head); +} + +static void cfq_remove_request(request_queue_t *q, struct request *rq) +{ + struct cfq_rq *crq = RQ_DATA(rq); + + if (crq) { cfq_remove_merge_hints(q, crq); list_del_init(&rq->queuelist); - if (cfqq) { - cfq_del_crq_rb(cfqq, crq); - - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - } + if (crq->cfq_queue) + cfq_del_crq_rb(crq); } } static int 
cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct request *__rq; int ret; @@ -305,7 +687,7 @@ out_insert: static void cfq_merged_request(request_queue_t *q, struct request *req) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(req); cfq_del_crq_hash(crq); @@ -314,193 +696,546 @@ static void cfq_merged_request(request_queue_t *q, struct request *req) if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { struct cfq_queue *cfqq = crq->cfq_queue; - cfq_del_crq_rb(cfqq, crq); - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_update_next_crq(crq); + cfq_reposition_crq_rb(cfqq, crq); } q->last_merge = req; } static void -cfq_merged_requests(request_queue_t *q, struct request *req, +cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - cfq_merged_request(q, req); + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_rq *cnext = RQ_DATA(next); + + cfq_merged_request(q, rq); + + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { + if (time_before(cnext->queue_start, crq->queue_start)) { + list_move(&rq->queuelist, &next->queuelist); + crq->queue_start = cnext->queue_start; + } + } + + cfq_update_next_crq(cnext); cfq_remove_request(q, next); } -static void -cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq) +/* + * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues, + * this function sector sorts the selected request to minimize seeks. we start + * at cfqd->last_sector, not 0. + */ +static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) { - struct list_head *head = cfqd->dispatch, *entry = head; + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = crq->cfq_queue; + struct list_head *head = &q->queue_head, *entry = head; struct request *__rq; + sector_t last; - cfq_del_crq_rb(cfqq, crq); - cfq_remove_merge_hints(cfqd->queue, crq); + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); + list_del(&crq->request->queuelist); - if (!list_empty(head)) { - __rq = list_entry_rq(head->next); + last = cfqd->last_sector; + while ((entry = entry->prev) != head) { + __rq = list_entry_rq(entry); - if (crq->request->sector < __rq->sector) { - entry = head->prev; - goto link; + if (blk_barrier_rq(crq->request)) + break; + if (!blk_fs_request(crq->request)) + break; + + if (crq->request->sector > __rq->sector) + break; + if (__rq->sector > last && crq->request->sector < last) { + last = crq->request->sector; + break; } } - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); + cfqd->last_sector = last; + crq->in_flight = 1; + cfqq->in_flight++; + list_add(&crq->request->queuelist, entry); +} - if (crq->request->sector <= __rq->sector) - break; +/* + * return expired entry, or NULL to just start from scratch in rbtree + */ +static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) +{ + struct cfq_data *cfqd = cfqq->cfqd; + const int reads = !list_empty(&cfqq->fifo[0]); + const int writes = !list_empty(&cfqq->fifo[1]); + unsigned long now = jiffies; + struct cfq_rq *crq; + + if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire)) + return NULL; + + crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); + if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) { + 
cfqq->last_fifo_expire = now; + return crq; + } + + crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); + if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) { + cfqq->last_fifo_expire = now; + return crq; } -link: - list_add_tail(&crq->request->queuelist, entry); + return NULL; } +/* + * dispatch a single request from given queue + */ static inline void -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, + struct cfq_queue *cfqq) { - struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + struct cfq_rq *crq; + + /* + * follow expired path, else get first next available + */ + if ((crq = cfq_check_fifo(cfqq)) == NULL) { + if (cfqd->find_best_crq) + crq = cfqq->next_crq; + else + crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + } + + cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; - cfq_dispatch_sort(cfqd, cfqq, crq); + /* + * finally, insert request into driver list + */ + cfq_dispatch_sort(q, crq); } -static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) +static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq; struct list_head *entry, *tmp; - int ret, queued, good_queues; + int queued, busy_queues, first_round; if (list_empty(&cfqd->rr_list)) return 0; - queued = ret = 0; + queued = 0; + first_round = 1; restart: - good_queues = 0; + busy_queues = 0; list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(cfqd->rr_list.next); + cfqq = list_entry_cfqq(entry); BUG_ON(RB_EMPTY(&cfqq->sort_list)); - __cfq_dispatch_requests(q, cfqd, cfqq); + /* + * first round of queueing, only select from queues that + * don't already have io in-flight + */ + if (first_round && cfqq->in_flight) + continue; - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - else - good_queues++; + cfq_dispatch_request(q, cfqd, cfqq); + + if (!RB_EMPTY(&cfqq->sort_list)) + busy_queues++; queued++; - ret = 1; } - if ((queued < cfqd->cfq_quantum) && good_queues) + if ((queued < max_dispatch) && (busy_queues || first_round)) { + first_round = 0; goto restart; + } - return ret; + return queued; +} + +static inline void cfq_account_dispatch(struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_data *cfqd = cfqq->cfqd; + unsigned long now, elapsed; + + /* + * accounted bit is necessary since some drivers will call + * elv_next_request() many times for the same request (eg ide) + */ + if (crq->accounted) + return; + + now = jiffies; + if (cfqq->service_start == ~0UL) + cfqq->service_start = now; + + /* + * on drives with tagged command queueing, command turn-around time + * doesn't necessarily reflect the time spent processing this very + * command inside the drive. 
so do the accounting differently there, + * by just sorting on the number of requests + */ + if (cfqd->cfq_tagged) { + if (time_after(now, cfqq->service_start + cfq_service)) { + cfqq->service_start = now; + cfqq->service_used /= 10; + } + + cfqq->service_used++; + cfq_sort_rr_list(cfqq, 0); + } + + elapsed = now - crq->queue_start; + if (elapsed > max_elapsed_dispatch) + max_elapsed_dispatch = elapsed; + + crq->accounted = 1; + crq->service_start = now; + + if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) { + cfqq->cfqd->cfq_tagged = 1; + printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG); + } +} + +static inline void +cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) +{ + struct cfq_data *cfqd = cfqq->cfqd; + + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; + + if (!cfqd->cfq_tagged) { + unsigned long now = jiffies; + unsigned long duration = now - crq->service_start; + + if (time_after(now, cfqq->service_start + cfq_service)) { + cfqq->service_start = now; + cfqq->service_used >>= 3; + } + + cfqq->service_used += duration; + cfq_sort_rr_list(cfqq, 0); + + if (duration > max_elapsed_crq) + max_elapsed_crq = duration; + } } static struct request *cfq_next_request(request_queue_t *q) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct request *rq; - if (!list_empty(cfqd->dispatch)) { + if (!list_empty(&q->queue_head)) { struct cfq_rq *crq; dispatch: - rq = list_entry_rq(cfqd->dispatch->next); + rq = list_entry_rq(q->queue_head.next); - crq = RQ_DATA(rq); - if (crq) + if ((crq = RQ_DATA(rq)) != NULL) { cfq_remove_merge_hints(q, crq); + cfq_account_dispatch(crq); + } return rq; } - if (cfq_dispatch_requests(q, cfqd)) + if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) goto dispatch; return NULL; } +/* + * task holds one reference to the queue, dropped when task exits. each crq + * in-flight on this queue also holds a reference, dropped when crq is freed. + * + * queue lock must be held here. 
+ */ +static void cfq_put_queue(struct cfq_queue *cfqq) +{ + BUG_ON(!atomic_read(&cfqq->ref)); + + if (!atomic_dec_and_test(&cfqq->ref)) + return; + + BUG_ON(rb_first(&cfqq->sort_list)); + BUG_ON(cfqq->on_rr); + + cfq_put_cfqd(cfqq->cfqd); + + /* + * it's on the empty list and still hashed + */ + list_del(&cfqq->cfq_list); + hlist_del(&cfqq->cfq_hash); + kmem_cache_free(cfq_pool, cfqq); +} + static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) { - struct list_head *hash_list = &cfqd->cfq_hash[hashval]; - struct list_head *entry; + struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; + struct hlist_node *entry, *next; - list_for_each(entry, hash_list) { + hlist_for_each_safe(entry, next, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); - if (__cfqq->pid == pid) + if (__cfqq->key == key) return __cfqq; } return NULL; } -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) +static struct cfq_queue * +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); +} + +static inline void +cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, + struct cfq_io_context *cic) +{ + unsigned long hashkey = cfq_hash_key(cfqd, current); + unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + struct cfq_queue *__cfqq; + unsigned long flags; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); - return __cfq_find_cfq_hash(cfqd, pid, hashval); + hlist_del(&(*cfqq)->cfq_hash); + + __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); + if (!__cfqq || __cfqq == *cfqq) { + __cfqq = *cfqq; + hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + __cfqq->key_type = cfqd->key_type; + } else { + atomic_inc(&__cfqq->ref); + cic->cfqq = __cfqq; + cfq_put_queue(*cfqq); + *cfqq = __cfqq; + } + + cic->cfqq = __cfqq; + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_free_io_context(struct cfq_io_context *cic) { - cfqd->busy_queues--; - list_del(&cfqq->cfq_list); - list_del(&cfqq->cfq_hash); - mempool_free(cfqq, cfq_mpool); + kmem_cache_free(cfq_ioc_pool, cic); } -static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +/* + * locking hierarchy is: io_context lock -> queue locks + */ +static void cfq_exit_io_context(struct cfq_io_context *cic) { - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + struct cfq_queue *cfqq = cic->cfqq; + struct list_head *entry = &cic->list; + request_queue_t *q; + unsigned long flags; + + /* + * put the reference this task is holding to the various queues + */ + spin_lock_irqsave(&cic->ioc->lock, flags); + while ((entry = cic->list.next) != &cic->list) { + struct cfq_io_context *__cic; + + __cic = list_entry(entry, struct cfq_io_context, list); + list_del(entry); + + q = __cic->cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(__cic->cfqq); + spin_unlock(q->queue_lock); + } + + q = cfqq->cfqd->queue; + spin_lock(q->queue_lock); + cfq_put_queue(cfqq); + spin_unlock(q->queue_lock); + + cic->cfqq = NULL; + spin_unlock_irqrestore(&cic->ioc->lock, flags); +} + +static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) +{ + struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, 
gfp_flags); + + if (cic) { + cic->dtor = cfq_free_io_context; + cic->exit = cfq_exit_io_context; + INIT_LIST_HEAD(&cic->list); + cic->cfqq = NULL; + } + + return cic; +} + +/* + * Setup general io context and cfq io context. There can be several cfq + * io contexts per general io context, if this process is doing io to more + * than one device managed by cfq. Note that caller is holding a reference to + * cfqq, so we don't need to worry about it disappearing + */ +static struct cfq_io_context * +cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) +{ + struct cfq_data *cfqd = (*cfqq)->cfqd; + struct cfq_queue *__cfqq = *cfqq; + struct cfq_io_context *cic; + struct io_context *ioc; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + ioc = get_io_context(gfp_flags); + if (!ioc) + return NULL; + + if ((cic = ioc->cic) == NULL) { + cic = cfq_alloc_io_context(gfp_flags); + + if (cic == NULL) + goto err; + + ioc->cic = cic; + cic->ioc = ioc; + cic->cfqq = __cfqq; + atomic_inc(&__cfqq->ref); + } else { + struct cfq_io_context *__cic; + unsigned long flags; + + /* + * since the first cic on the list is actually the head + * itself, need to check this here or we'll duplicate an + * cic per ioc for no reason + */ + if (cic->cfqq == __cfqq) + goto out; + + /* + * cic exists, check if we already are there. linear search + * should be ok here, the list will usually not be more than + * 1 or a few entries long + */ + spin_lock_irqsave(&ioc->lock, flags); + list_for_each_entry(__cic, &cic->list, list) { + /* + * this process is already holding a reference to + * this queue, so no need to get one more + */ + if (__cic->cfqq == __cfqq) { + cic = __cic; + spin_unlock_irqrestore(&ioc->lock, flags); + goto out; + } + } + spin_unlock_irqrestore(&ioc->lock, flags); + + /* + * nope, process doesn't have a cic assoicated with this + * cfqq yet. 
get a new one and add to list + */ + __cic = cfq_alloc_io_context(gfp_flags); + if (__cic == NULL) + goto err; + + __cic->ioc = ioc; + __cic->cfqq = __cfqq; + atomic_inc(&__cfqq->ref); + spin_lock_irqsave(&ioc->lock, flags); + list_add(&__cic->list, &cic->list); + spin_unlock_irqrestore(&ioc->lock, flags); + + cic = __cic; + *cfqq = __cfqq; + } + +out: + /* + * if key_type has been changed on the fly, we lazily rehash + * each queue at lookup time + */ + if ((*cfqq)->key_type != cfqd->key_type) + cfq_rehash_cfqq(cfqd, cfqq, cic); + + return cic; +err: + put_io_context(ioc); + return NULL; +} + +static struct cfq_queue * +__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) +{ + const int hashval = hash_long(key, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; - request_queue_t *q = cfqd->queue; retry: - cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); if (!cfqq) { if (new_cfqq) { cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - spin_unlock_irq(q->queue_lock); - new_cfqq = mempool_alloc(cfq_mpool, gfp_mask); - spin_lock_irq(q->queue_lock); + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else - return NULL; + goto out; + + memset(cfqq, 0, sizeof(*cfqq)); - INIT_LIST_HEAD(&cfqq->cfq_hash); + INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); RB_CLEAR_ROOT(&cfqq->sort_list); - - cfqq->pid = pid; - cfqq->queued[0] = cfqq->queued[1] = 0; - list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + INIT_LIST_HEAD(&cfqq->fifo[0]); + INIT_LIST_HEAD(&cfqq->fifo[1]); + + cfqq->key = key; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; + atomic_inc(&cfqd->ref); + cfqq->key_type = cfqd->key_type; + cfqq->service_start = ~0UL; } if (new_cfqq) - mempool_free(new_cfqq, cfq_mpool); + kmem_cache_free(cfq_pool, new_cfqq); + atomic_inc(&cfqq->ref); +out: + WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); return cfqq; } -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, - int gfp_mask) +static struct cfq_queue * +cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) { request_queue_t *q = cfqd->queue; struct cfq_queue *cfqq; spin_lock_irq(q->queue_lock); - cfqq = __cfq_get_queue(cfqd, pid, gfp_mask); + cfqq = __cfq_get_queue(cfqd, key, gfp_mask); spin_unlock_irq(q->queue_lock); return cfqq; @@ -508,40 +1243,30 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) { - struct cfq_queue *cfqq; + crq->is_sync = 0; + if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) + crq->is_sync = 1; - cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC); - if (cfqq) { - cfq_add_crq_rb(cfqd, cfqq, crq); + cfq_add_crq_rb(crq); + crq->queue_start = jiffies; - if (list_empty(&cfqq->cfq_list)) { - list_add(&cfqq->cfq_list, &cfqd->rr_list); - cfqd->busy_queues++; - } - } else { - /* - * should can only happen if the request wasn't allocated - * through blk_alloc_request(), eg stack requests from ide-cd - * (those should be removed) _and_ we are in OOM. 
- */ - list_add_tail(&crq->request->queuelist, cfqd->dispatch); - } + list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); } static void cfq_insert_request(request_queue_t *q, struct request *rq, int where) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); switch (where) { case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd)) + while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) ; - list_add_tail(&rq->queuelist, cfqd->dispatch); + list_add_tail(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, cfqd->dispatch); + list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); @@ -562,12 +1287,27 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) static int cfq_queue_empty(request_queue_t *q) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; - if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) - return 1; + return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); +} + +static void cfq_completed_request(request_queue_t *q, struct request *rq) +{ + struct cfq_rq *crq = RQ_DATA(rq); + + if (unlikely(!blk_fs_request(rq))) + return; + + if (crq->in_flight) { + struct cfq_queue *cfqq = crq->cfq_queue; + + WARN_ON(!cfqq->in_flight); + cfqq->in_flight--; + + cfq_account_completion(cfqq, crq); + } - return 0; } static struct request * @@ -596,92 +1336,169 @@ cfq_latter_request(request_queue_t *q, struct request *rq) static int cfq_may_queue(request_queue_t *q, int rw) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq; - int ret = 1; + int ret = ELV_MQUEUE_MAY; - if (!cfqd->busy_queues) - goto out; + if (current->flags & PF_MEMALLOC) + return ELV_MQUEUE_MAY; - cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); if (cfqq) { - int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues; + int limit = cfqd->max_queued; + + if (cfqq->allocated[rw] < cfqd->cfq_queued) + return ELV_MQUEUE_MUST; - if (limit < 3) - limit = 3; + if (cfqd->busy_queues) + limit = q->nr_requests / cfqd->busy_queues; + + if (limit < cfqd->cfq_queued) + limit = cfqd->cfq_queued; else if (limit > cfqd->max_queued) limit = cfqd->max_queued; - if (cfqq->queued[rw] > limit) - ret = 0; + if (cfqq->allocated[rw] >= limit) { + if (limit > cfqq->alloc_limit[rw]) + cfqq->alloc_limit[rw] = limit; + + ret = ELV_MQUEUE_NO; + } } -out: + return ret; } +static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) +{ + struct request_list *rl = &q->rq; + const int write = waitqueue_active(&rl->wait[WRITE]); + const int read = waitqueue_active(&rl->wait[READ]); + + if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) + wake_up(&rl->wait[READ]); + if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) + wake_up(&rl->wait[WRITE]); +} + +/* + * queue lock held here + */ static void cfq_put_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); - struct request_list *rl; - int other_rw; if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + BUG_ON(q->last_merge == rq); - BUG_ON(ON_MHASH(crq)); + 
BUG_ON(!hlist_unhashed(&crq->hash)); + + if (crq->io_context) + put_io_context(crq->io_context->ioc); + + if (!cfqq->allocated[crq->is_write]) { + WARN_ON(1); + cfqq->allocated[crq->is_write] = 1; + } + cfqq->allocated[crq->is_write]--; mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; - } - /* - * work-around for may_queue "bug": if a read gets issued and refused - * to queue because writes ate all the allowed slots and no other - * reads are pending for this queue, it could get stuck infinitely - * since freed_request() only checks the waitqueue for writes when - * freeing them. or vice versa for a single write vs many reads. - * so check here whether "the other" data direction might be able - * to queue and wake them - */ - rl = &q->rq; - other_rw = rq_data_dir(rq) ^ 1; - if (rl->count[other_rw] <= q->nr_requests) { smp_mb(); - if (waitqueue_active(&rl->wait[other_rw])) - wake_up(&rl->wait[other_rw]); + cfq_check_waiters(q, cfqq); + cfq_put_queue(cfqq); } } +/* + * Allocate cfq data structures associated with this request. A queue and + */ static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - struct cfq_data *cfqd = q->elevator.elevator_data; + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_io_context *cic; + const int rw = rq_data_dir(rq); struct cfq_queue *cfqq; struct cfq_rq *crq; + unsigned long flags; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + spin_lock_irqsave(q->queue_lock, flags); + + cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask); + if (!cfqq) { +#if 0 + cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, gfp_mask); + printk("%s: got spare queue\n", current->comm); +#else + goto out_lock; +#endif + } + + if (cfqq->allocated[rw] >= cfqd->max_queued) + goto out_lock; + + spin_unlock_irqrestore(q->queue_lock, flags); /* - * prepare a queue up front, so cfq_enqueue() doesn't have to + * if hashing type has changed, the cfq_queue might change here. we + * don't bother rechecking ->allocated since it should be a rare + * event */ - cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask); - if (!cfqq) - return 1; + cic = cfq_get_io_context(&cfqq, gfp_mask); + if (!cic) + goto err; crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { - memset(crq, 0, sizeof(*crq)); RB_CLEAR(&crq->rb_node); + crq->rb_key = 0; crq->request = rq; - crq->cfq_queue = NULL; - INIT_LIST_HEAD(&crq->hash); + INIT_HLIST_NODE(&crq->hash); + crq->cfq_queue = cfqq; + crq->io_context = cic; + crq->service_start = crq->queue_start = 0; + crq->in_flight = crq->accounted = crq->is_sync = 0; + crq->is_write = rw; rq->elevator_private = crq; + cfqq->allocated[rw]++; + cfqq->alloc_limit[rw] = 0; return 0; } + put_io_context(cic->ioc); +err: + spin_lock_irqsave(q->queue_lock, flags); + cfq_put_queue(cfqq); +out_lock: + spin_unlock_irqrestore(q->queue_lock, flags); return 1; } -static void cfq_exit(request_queue_t *q, elevator_t *e) +static void cfq_put_cfqd(struct cfq_data *cfqd) { - struct cfq_data *cfqd = e->elevator_data; + request_queue_t *q = cfqd->queue; + elevator_t *e = q->elevator; + struct cfq_queue *cfqq; + + if (!atomic_dec_and_test(&cfqd->ref)) + return; + + /* + * kill spare queue, getting it means we have two refences to it. 
+ * drop both + */ + spin_lock_irq(q->queue_lock); + cfqq = __cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_ATOMIC); + cfq_put_queue(cfqq); + cfq_put_queue(cfqq); + spin_unlock_irq(q->queue_lock); + + blk_put_queue(q); e->elevator_data = NULL; mempool_destroy(cfqd->crq_pool); @@ -690,9 +1507,15 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } -static int cfq_init(request_queue_t *q, elevator_t *e) +static void cfq_exit_queue(elevator_t *e) +{ + cfq_put_cfqd(e->elevator_data); +} + +static int cfq_init_queue(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; + struct cfq_queue *cfqq; int i; cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); @@ -701,12 +1524,13 @@ static int cfq_init(request_queue_t *q, elevator_t *e) memset(cfqd, 0, sizeof(*cfqd)); INIT_LIST_HEAD(&cfqd->rr_list); + INIT_LIST_HEAD(&cfqd->empty_list); - cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); + cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) goto out_crqhash; - cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); + cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); if (!cfqd->cfq_hash) goto out_cfqhash; @@ -715,25 +1539,44 @@ static int cfq_init(request_queue_t *q, elevator_t *e) goto out_crqpool; for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->crq_hash[i]); + INIT_HLIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->cfq_hash[i]); + INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - cfqd->dispatch = &q->queue_head; e->elevator_data = cfqd; + cfqd->queue = q; + atomic_inc(&q->refcnt); + + /* + * setup spare failure queue + */ + cfqq = cfq_get_queue(cfqd, CFQ_KEY_SPARE, GFP_KERNEL); + if (!cfqq) + goto out_spare; /* * just set it to some high value, we want anyone to be able to queue * some requests. 
fairness is handled differently */ - cfqd->max_queued = q->nr_requests; - q->nr_requests = 8192; + q->nr_requests = 1024; + cfqd->max_queued = q->nr_requests / 16; + q->nr_batching = cfq_queued; + cfqd->key_type = CFQ_KEY_TGID; + cfqd->find_best_crq = 1; + atomic_set(&cfqd->ref, 1); cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; + cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; + cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; + cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; + cfqd->cfq_back_max = cfq_back_max; + cfqd->cfq_back_penalty = cfq_back_penalty; return 0; +out_spare: + mempool_destroy(cfqd->crq_pool); out_crqpool: kfree(cfqd->cfq_hash); out_cfqhash: @@ -743,29 +1586,39 @@ out_crqhash: return -ENOMEM; } +static void cfq_slab_kill(void) +{ + if (crq_pool) + kmem_cache_destroy(crq_pool); + if (cfq_pool) + kmem_cache_destroy(cfq_pool); + if (cfq_ioc_pool) + kmem_cache_destroy(cfq_ioc_pool); +} + static int __init cfq_slab_setup(void) { crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, NULL, NULL); - if (!crq_pool) - panic("cfq_iosched: can't init crq pool\n"); + goto fail; cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, NULL, NULL); - if (!cfq_pool) - panic("cfq_iosched: can't init cfq pool\n"); + goto fail; - cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - - if (!cfq_mpool) - panic("cfq_iosched: can't init cfq mpool\n"); + cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool", + sizeof(struct cfq_io_context), 0, 0, NULL, NULL); + if (!cfq_ioc_pool) + goto fail; return 0; +fail: + cfq_slab_kill(); + return -ENOMEM; } -subsys_initcall(cfq_slab_setup); /* * sysfs parts below --> @@ -791,27 +1644,135 @@ cfq_var_store(unsigned int *var, const char *page, size_t count) return count; } -#define SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t +cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) +{ + max_elapsed_dispatch = max_elapsed_crq = 0; + return count; +} + +static ssize_t +cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) +{ + spin_lock_irq(cfqd->queue->queue_lock); + if (!strncmp(page, "pgid", 4)) + cfqd->key_type = CFQ_KEY_PGID; + else if (!strncmp(page, "tgid", 4)) + cfqd->key_type = CFQ_KEY_TGID; + else if (!strncmp(page, "uid", 3)) + cfqd->key_type = CFQ_KEY_UID; + else if (!strncmp(page, "gid", 3)) + cfqd->key_type = CFQ_KEY_GID; + spin_unlock_irq(cfqd->queue->queue_lock); + return count; +} + +static ssize_t +cfq_read_key_type(struct cfq_data *cfqd, char *page) +{ + ssize_t len = 0; + int i; + + for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { + if (cfqd->key_type == i) + len += sprintf(page+len, "[%s] ", cfq_key_types[i]); + else + len += sprintf(page+len, "%s ", cfq_key_types[i]); + } + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +cfq_status_show(struct cfq_data *cfqd, char *page) +{ + struct list_head *entry; + struct cfq_queue *cfqq; + ssize_t len; + int i = 0, queues; + + len = sprintf(page, "Busy queues: %u\n", cfqd->busy_queues); + len += sprintf(page+len, "key type: %s\n", + cfq_key_types[cfqd->key_type]); + len += sprintf(page+len, "last sector: %Lu\n", + (unsigned long long)cfqd->last_sector); + len += sprintf(page+len, "max time in iosched: %lu\n", + max_elapsed_dispatch); + len += sprintf(page+len, "max completion time: %lu\n", max_elapsed_crq); + + len += sprintf(page+len, "Busy queue list:\n"); + spin_lock_irq(cfqd->queue->queue_lock); + list_for_each(entry, &cfqd->rr_list) { + i++; + cfqq = list_entry_cfqq(entry); + 
len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, " + "queued=%d/%d, last_fifo=%lu, service_used=%lu\n", + cfqq->key, cfqq->allocated[0], cfqq->allocated[1], + cfqq->queued[0], cfqq->queued[1], + cfqq->last_fifo_expire, cfqq->service_used); + } + len += sprintf(page+len, " busy queues total: %d\n", i); + queues = i; + + len += sprintf(page+len, "Empty queue list:\n"); + i = 0; + list_for_each(entry, &cfqd->empty_list) { + i++; + cfqq = list_entry_cfqq(entry); + len += sprintf(page+len, " cfqq: key=%lu alloc=%d/%d, " + "queued=%d/%d, last_fifo=%lu, service_used=%lu\n", + cfqq->key, cfqq->allocated[0], cfqq->allocated[1], + cfqq->queued[0], cfqq->queued[1], + cfqq->last_fifo_expire, cfqq->service_used); + } + len += sprintf(page+len, " empty queues total: %d\n", i); + queues += i; + len += sprintf(page+len, "Total queues: %d\n", queues); + spin_unlock_irq(cfqd->queue->queue_lock); + return len; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ - return cfq_var_show(__VAR, (page)); \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return cfq_var_show(__data, (page)); \ } -SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); -SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); +SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); +SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); +SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1); +SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1); +SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1); +SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0); +SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); +SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0); #undef SHOW_FUNCTION -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ { \ - int ret = cfq_var_store(__PTR, (page), count); \ - if (*(__PTR) < (MIN)) \ - *(__PTR) = (MIN); \ - else if (*(__PTR) > (MAX)) \ - *(__PTR) = (MAX); \ + unsigned int __data; \ + int ret = cfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ return ret; \ } -STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); +STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0); +STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); +STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { @@ -824,10 +1785,62 @@ static struct cfq_fs_entry cfq_queued_entry = { .show = cfq_queued_show, .store = cfq_queued_store, }; +static struct cfq_fs_entry cfq_fifo_expire_r_entry = { + .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, + .show = 
cfq_fifo_expire_r_show, + .store = cfq_fifo_expire_r_store, +}; +static struct cfq_fs_entry cfq_fifo_expire_w_entry = { + .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_expire_w_show, + .store = cfq_fifo_expire_w_store, +}; +static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { + .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_fifo_batch_expire_show, + .store = cfq_fifo_batch_expire_store, +}; +static struct cfq_fs_entry cfq_find_best_entry = { + .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_find_best_show, + .store = cfq_find_best_store, +}; +static struct cfq_fs_entry cfq_back_max_entry = { + .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_back_max_show, + .store = cfq_back_max_store, +}; +static struct cfq_fs_entry cfq_back_penalty_entry = { + .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_back_penalty_show, + .store = cfq_back_penalty_store, +}; +static struct cfq_fs_entry cfq_clear_elapsed_entry = { + .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, + .store = cfq_clear_elapsed, +}; +static struct cfq_fs_entry cfq_misc_entry = { + .attr = {.name = "show_status", .mode = S_IRUGO }, + .show = cfq_status_show, +}; +static struct cfq_fs_entry cfq_key_type_entry = { + .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_read_key_type, + .store = cfq_set_key_type, +}; static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, + &cfq_fifo_expire_r_entry.attr, + &cfq_fifo_expire_w_entry.attr, + &cfq_fifo_batch_expire_entry.attr, + &cfq_key_type_entry.attr, + &cfq_find_best_entry.attr, + &cfq_back_max_entry.attr, + &cfq_back_penalty_entry.attr, + &cfq_clear_elapsed_entry.attr, + &cfq_misc_entry.attr, NULL, }; @@ -868,23 +1881,56 @@ struct kobj_type cfq_ktype = { .default_attrs = default_attrs, }; -elevator_t iosched_cfq = { - .elevator_name = "cfq", - .elevator_ktype = &cfq_ktype, - .elevator_merge_fn = cfq_merge, - .elevator_merged_fn = cfq_merged_request, - .elevator_merge_req_fn = cfq_merged_requests, - .elevator_next_req_fn = cfq_next_request, - .elevator_add_req_fn = cfq_insert_request, - .elevator_remove_req_fn = cfq_remove_request, - .elevator_queue_empty_fn = cfq_queue_empty, - .elevator_former_req_fn = cfq_former_request, - .elevator_latter_req_fn = cfq_latter_request, - .elevator_set_req_fn = cfq_set_request, - .elevator_put_req_fn = cfq_put_request, - .elevator_may_queue_fn = cfq_may_queue, - .elevator_init_fn = cfq_init, - .elevator_exit_fn = cfq_exit, +static struct elevator_type iosched_cfq = { + .ops = { + .elevator_merge_fn = cfq_merge, + .elevator_merged_fn = cfq_merged_request, + .elevator_merge_req_fn = cfq_merged_requests, + .elevator_next_req_fn = cfq_next_request, + .elevator_add_req_fn = cfq_insert_request, + .elevator_remove_req_fn = cfq_remove_request, + .elevator_requeue_req_fn = cfq_requeue_request, + .elevator_queue_empty_fn = cfq_queue_empty, + .elevator_completed_req_fn = cfq_completed_request, + .elevator_former_req_fn = cfq_former_request, + .elevator_latter_req_fn = cfq_latter_request, + .elevator_set_req_fn = cfq_set_request, + .elevator_put_req_fn = cfq_put_request, + .elevator_may_queue_fn = cfq_may_queue, + .elevator_init_fn = cfq_init_queue, + .elevator_exit_fn = cfq_exit_queue, + }, + .elevator_ktype = &cfq_ktype, + .elevator_name = "cfq", + .elevator_owner = THIS_MODULE, }; -EXPORT_SYMBOL(iosched_cfq); +int cfq_init(void) +{ + 
int ret; + + if (cfq_slab_setup()) + return -ENOMEM; + + ret = elv_register(&iosched_cfq); + if (!ret) { + __module_get(THIS_MODULE); + return 0; + } + + cfq_slab_kill(); + return ret; +} + +void cfq_exit(void) +{ + cfq_slab_kill(); + elv_unregister(&iosched_cfq); +} + +module_init(cfq_init); +module_exit(cfq_exit); + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 204b3182900d..dc896a12283b 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -21,7 +21,6 @@ */ #include <linux/config.h> /* CONFIG_PROC_FS */ #include <linux/module.h> -#include <linux/version.h> #include <linux/types.h> #include <linux/pci.h> #include <linux/bio.h> @@ -732,7 +731,6 @@ static void __iomem *remap_pci_mem(ulong base, ulong size) } #ifndef MODULE -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,13) /* * Config string is a comma separated set of i/o addresses of EISA cards. */ @@ -749,18 +747,6 @@ static int cpqarray_setup(char *str) __setup("smart2=", cpqarray_setup); -#else - -/* - * Copy the contents of the ints[] array passed to us by init. - */ -void cpqarray_setup(char *str, int *ints) -{ - int i; - for(i=0; i<ints[0] && i<8; i++) - eisa[i] = ints[i+1]; -} -#endif #endif /* diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index fb7ab733c709..f482e8bdb4d6 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -289,7 +289,7 @@ deadline_find_first_drq(struct deadline_data *dd, int data_dir) static inline void deadline_add_request(struct request_queue *q, struct request *rq) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(rq); const int data_dir = rq_data_dir(drq->request); @@ -317,7 +317,7 @@ static void deadline_remove_request(request_queue_t *q, struct request *rq) struct deadline_rq *drq = RQ_DATA(rq); if (drq) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; list_del_init(&drq->fifo); deadline_remove_merge_hints(q, drq); @@ -328,7 +328,7 @@ static void deadline_remove_request(request_queue_t *q, struct request *rq) static int deadline_merge(request_queue_t *q, struct request **req, struct bio *bio) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct request *__rq; int ret; @@ -383,7 +383,7 @@ out_insert: static void deadline_merged_request(request_queue_t *q, struct request *req) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(req); /* @@ -407,7 +407,7 @@ static void deadline_merged_requests(request_queue_t *q, struct request *req, struct request *next) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(req); struct deadline_rq *dnext = RQ_DATA(next); @@ -604,7 +604,7 @@ dispatch_request: static struct request *deadline_next_request(request_queue_t *q) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct request *rq; /* @@ -625,7 +625,7 @@ dispatch: static void deadline_insert_request(request_queue_t *q, struct request *rq, int where) { - struct deadline_data *dd = q->elevator.elevator_data; + 
struct deadline_data *dd = q->elevator->elevator_data; /* barriers must flush the reorder queue */ if (unlikely(rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER) @@ -653,7 +653,7 @@ deadline_insert_request(request_queue_t *q, struct request *rq, int where) static int deadline_queue_empty(request_queue_t *q) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; if (!list_empty(&dd->fifo_list[WRITE]) || !list_empty(&dd->fifo_list[READ]) @@ -687,7 +687,7 @@ deadline_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static void deadline_exit(request_queue_t *q, elevator_t *e) +static void deadline_exit_queue(elevator_t *e) { struct deadline_data *dd = e->elevator_data; @@ -703,7 +703,7 @@ static void deadline_exit(request_queue_t *q, elevator_t *e) * initialize elevator private data (deadline_data), and alloc a drq for * each request on the free lists */ -static int deadline_init(request_queue_t *q, elevator_t *e) +static int deadline_init_queue(request_queue_t *q, elevator_t *e) { struct deadline_data *dd; int i; @@ -748,7 +748,7 @@ static int deadline_init(request_queue_t *q, elevator_t *e) static void deadline_put_request(request_queue_t *q, struct request *rq) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq = RQ_DATA(rq); if (drq) { @@ -760,7 +760,7 @@ static void deadline_put_request(request_queue_t *q, struct request *rq) static int deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - struct deadline_data *dd = q->elevator.elevator_data; + struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq; drq = mempool_alloc(dd->drq_pool, gfp_mask); @@ -805,33 +805,41 @@ deadline_var_store(unsigned int *var, const char *page, size_t count) return count; } -#define SHOW_FUNCTION(__FUNC, __VAR) \ +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct deadline_data *dd, char *page) \ { \ - return deadline_var_show(__VAR, (page)); \ -} -SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ]); -SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE]); -SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved); -SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges); -SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch); + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return deadline_var_show(__data, (page)); \ +} +SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1); +SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0); +SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0); +SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0); #undef SHOW_FUNCTION -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count) \ { \ - int ret = deadline_var_store(__PTR, (page), count); \ - if (*(__PTR) < (MIN)) \ - *(__PTR) = (MIN); \ - else if (*(__PTR) > (MAX)) \ - *(__PTR) = (MAX); \ + unsigned int __data; \ + int ret = deadline_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ return ret; \ } 
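
Both cfq and deadline now stamp out their sysfs accessors from a single macro pair, with a __CONV flag deciding whether the stored value is converted between jiffies and milliseconds on the way in and out. A compilable userspace analog of that pattern is below; the HZ_SIM rate, struct and tunable names are made up for illustration and do not come from the patch.

#include <stdio.h>
#include <stdlib.h>

#define HZ_SIM 250	/* illustrative tick rate standing in for HZ */

static unsigned int ticks_to_ms(unsigned int t)  { return t * (1000 / HZ_SIM); }
static unsigned int ms_to_ticks(unsigned int ms) { return ms / (1000 / HZ_SIM); }

struct tunables {
	unsigned int quantum;		/* plain count, no conversion */
	unsigned int fifo_expire;	/* stored in ticks, shown in ms */
};

#define SHOW_FUNCTION(__FUNC, __FIELD, __CONV)				\
static int __FUNC(struct tunables *t, char *page)			\
{									\
	unsigned int __data = t->__FIELD;				\
	if (__CONV)							\
		__data = ticks_to_ms(__data);				\
	return sprintf(page, "%u\n", __data);				\
}
SHOW_FUNCTION(quantum_show, quantum, 0)
SHOW_FUNCTION(fifo_expire_show, fifo_expire, 1)
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __FIELD, MIN, MAX, __CONV)		\
static void __FUNC(struct tunables *t, const char *page)		\
{									\
	unsigned int __data = (unsigned int)strtoul(page, NULL, 10);	\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	t->__FIELD = __CONV ? ms_to_ticks(__data) : __data;		\
}
STORE_FUNCTION(quantum_store, quantum, 1, 1024, 0)
STORE_FUNCTION(fifo_expire_store, fifo_expire, 1, 60000, 1)
#undef STORE_FUNCTION

int main(void)
{
	struct tunables t = { .quantum = 4, .fifo_expire = 0 };
	char page[16];

	fifo_expire_store(&t, "500");	/* user writes milliseconds... */
	fifo_expire_show(&t, page);	/* ...and reads milliseconds back */
	printf("fifo_expire: %s", page);
	quantum_store(&t, "8");
	quantum_show(&t, page);
	printf("quantum: %s", page);
	return 0;
}
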
-STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX); -STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX); -STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX); -STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1); -STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX); +STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); +STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0); +STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0); #undef STORE_FUNCTION static struct deadline_fs_entry deadline_readexpire_entry = { @@ -906,36 +914,54 @@ struct kobj_type deadline_ktype = { .default_attrs = default_attrs, }; -static int __init deadline_slab_setup(void) +static struct elevator_type iosched_deadline = { + .ops = { + .elevator_merge_fn = deadline_merge, + .elevator_merged_fn = deadline_merged_request, + .elevator_merge_req_fn = deadline_merged_requests, + .elevator_next_req_fn = deadline_next_request, + .elevator_add_req_fn = deadline_insert_request, + .elevator_remove_req_fn = deadline_remove_request, + .elevator_queue_empty_fn = deadline_queue_empty, + .elevator_former_req_fn = deadline_former_request, + .elevator_latter_req_fn = deadline_latter_request, + .elevator_set_req_fn = deadline_set_request, + .elevator_put_req_fn = deadline_put_request, + .elevator_init_fn = deadline_init_queue, + .elevator_exit_fn = deadline_exit_queue, + }, + + .elevator_ktype = &deadline_ktype, + .elevator_name = "deadline", + .elevator_owner = THIS_MODULE, +}; + +int deadline_init(void) { + int ret; + drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq), 0, 0, NULL, NULL); if (!drq_pool) - panic("deadline: can't init slab pool\n"); + return -ENOMEM; - return 0; + ret = elv_register(&iosched_deadline); + if (ret) + kmem_cache_destroy(drq_pool); + + return ret; } -subsys_initcall(deadline_slab_setup); - -elevator_t iosched_deadline = { - .elevator_merge_fn = deadline_merge, - .elevator_merged_fn = deadline_merged_request, - .elevator_merge_req_fn = deadline_merged_requests, - .elevator_next_req_fn = deadline_next_request, - .elevator_add_req_fn = deadline_insert_request, - .elevator_remove_req_fn = deadline_remove_request, - .elevator_queue_empty_fn = deadline_queue_empty, - .elevator_former_req_fn = deadline_former_request, - .elevator_latter_req_fn = deadline_latter_request, - .elevator_set_req_fn = deadline_set_request, - .elevator_put_req_fn = deadline_put_request, - .elevator_init_fn = deadline_init, - .elevator_exit_fn = deadline_exit, - - .elevator_ktype = &deadline_ktype, - .elevator_name = "deadline", -}; +void deadline_exit(void) +{ + kmem_cache_destroy(drq_pool); + elv_unregister(&iosched_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); -EXPORT_SYMBOL(iosched_deadline); +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("deadline IO scheduler"); diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 35c9385ac133..1b4f6a70c0ca 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -37,6 +37,9 @@ #include <asm/uaccess.h> +static spinlock_t elv_list_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(elv_list); + /* * can we safely merge with this request? 
*/ @@ -60,6 +63,7 @@ inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) return 0; } +EXPORT_SYMBOL(elv_rq_merge_ok); inline int elv_try_merge(struct request *__rq, struct bio *bio) { @@ -77,6 +81,7 @@ inline int elv_try_merge(struct request *__rq, struct bio *bio) return ret; } +EXPORT_SYMBOL(elv_try_merge); inline int elv_try_last_merge(request_queue_t *q, struct bio *bio) { @@ -85,31 +90,117 @@ inline int elv_try_last_merge(request_queue_t *q, struct bio *bio) return ELEVATOR_NO_MERGE; } +EXPORT_SYMBOL(elv_try_last_merge); -/* - * general block -> elevator interface starts here - */ -int elevator_init(request_queue_t *q, elevator_t *type) +struct elevator_type *elevator_find(const char *name) +{ + struct elevator_type *e = NULL; + struct list_head *entry; + + spin_lock_irq(&elv_list_lock); + list_for_each(entry, &elv_list) { + struct elevator_type *__e; + + __e = list_entry(entry, struct elevator_type, list); + + if (!strcmp(__e->elevator_name, name)) { + e = __e; + break; + } + } + spin_unlock_irq(&elv_list_lock); + + return e; +} + +static int elevator_attach(request_queue_t *q, struct elevator_type *e, + struct elevator_queue *eq) { - elevator_t *e = &q->elevator; + int ret = 0; - memcpy(e, type, sizeof(*e)); + if (!try_module_get(e->elevator_owner)) + return -EINVAL; + + memset(eq, 0, sizeof(*eq)); + eq->ops = &e->ops; + eq->elevator_type = e; INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; + q->elevator = eq; + + if (eq->ops->elevator_init_fn) + ret = eq->ops->elevator_init_fn(q, eq); - if (e->elevator_init_fn) - return e->elevator_init_fn(q, e); + return ret; +} + +static char chosen_elevator[16]; + +static void elevator_setup_default(void) +{ + /* + * check if default is set and exists + */ + if (chosen_elevator[0] && elevator_find(chosen_elevator)) + return; + +#if defined(CONFIG_IOSCHED_AS) + strcpy(chosen_elevator, "anticipatory"); +#elif defined(CONFIG_IOSCHED_DEADLINE) + strcpy(chosen_elevator, "deadline"); +#elif defined(CONFIG_IOSCHED_CFQ) + strcpy(chosen_elevator, "cfq"); +#elif defined(CONFIG_IOSCHED_NOOP) + strcpy(chosen_elevator, "noop"); +#else +#error "You must build at least 1 IO scheduler into the kernel" +#endif + printk("elevator: using %s as default io scheduler\n", chosen_elevator); +} +static int __init elevator_setup(char *str) +{ + strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); return 0; } -void elevator_exit(request_queue_t *q) +__setup("elevator=", elevator_setup); + +int elevator_init(request_queue_t *q, char *name) +{ + struct elevator_type *e = NULL; + struct elevator_queue *eq; + int ret = 0; + + elevator_setup_default(); + + if (!name) + name = chosen_elevator; + + e = elevator_find(name); + if (!e) + return -EINVAL; + + eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL); + if (!eq) + return -ENOMEM; + + ret = elevator_attach(q, e, eq); + if (ret) + kfree(eq); + + return ret; +} + +void elevator_exit(elevator_t *e) { - elevator_t *e = &q->elevator; + if (e->ops->elevator_exit_fn) + e->ops->elevator_exit_fn(e); - if (e->elevator_exit_fn) - e->elevator_exit_fn(q, e); + module_put(e->elevator_type->elevator_owner); + e->elevator_type = NULL; + kfree(e); } int elevator_global_init(void) @@ -119,32 +210,32 @@ int elevator_global_init(void) int elv_merge(request_queue_t *q, struct request **req, struct bio *bio) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_merge_fn) - return e->elevator_merge_fn(q, req, bio); + if (e->ops->elevator_merge_fn) + return e->ops->elevator_merge_fn(q, req, 
bio); return ELEVATOR_NO_MERGE; } void elv_merged_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_merged_fn) - e->elevator_merged_fn(q, rq); + if (e->ops->elevator_merged_fn) + e->ops->elevator_merged_fn(q, rq); } void elv_merge_requests(request_queue_t *q, struct request *rq, struct request *next) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; if (q->last_merge == next) q->last_merge = NULL; - if (e->elevator_merge_req_fn) - e->elevator_merge_req_fn(q, rq, next); + if (e->ops->elevator_merge_req_fn) + e->ops->elevator_merge_req_fn(q, rq, next); } void elv_requeue_request(request_queue_t *q, struct request *rq) @@ -160,8 +251,8 @@ void elv_requeue_request(request_queue_t *q, struct request *rq) * if iosched has an explicit requeue hook, then use that. otherwise * just put the request at the front of the queue */ - if (q->elevator.elevator_requeue_req_fn) - q->elevator.elevator_requeue_req_fn(q, rq); + if (q->elevator->ops->elevator_requeue_req_fn) + q->elevator->ops->elevator_requeue_req_fn(q, rq); else __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0); } @@ -180,7 +271,7 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where, blk_plug_device(q); rq->q = q; - q->elevator.elevator_add_req_fn(q, rq, where); + q->elevator->ops->elevator_add_req_fn(q, rq, where); if (blk_queue_plugged(q)) { int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight; @@ -203,7 +294,7 @@ void elv_add_request(request_queue_t *q, struct request *rq, int where, static inline struct request *__elv_next_request(request_queue_t *q) { - return q->elevator.elevator_next_req_fn(q); + return q->elevator->ops->elevator_next_req_fn(q); } struct request *elv_next_request(request_queue_t *q) @@ -252,7 +343,7 @@ struct request *elv_next_request(request_queue_t *q) void elv_remove_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; /* * the time frame between a request being removed from the lists @@ -274,16 +365,16 @@ void elv_remove_request(request_queue_t *q, struct request *rq) if (rq == q->last_merge) q->last_merge = NULL; - if (e->elevator_remove_req_fn) - e->elevator_remove_req_fn(q, rq); + if (e->ops->elevator_remove_req_fn) + e->ops->elevator_remove_req_fn(q, rq); } int elv_queue_empty(request_queue_t *q) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_queue_empty_fn) - return e->elevator_queue_empty_fn(q); + if (e->ops->elevator_queue_empty_fn) + return e->ops->elevator_queue_empty_fn(q); return list_empty(&q->queue_head); } @@ -292,10 +383,10 @@ struct request *elv_latter_request(request_queue_t *q, struct request *rq) { struct list_head *next; - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_latter_req_fn) - return e->elevator_latter_req_fn(q, rq); + if (e->ops->elevator_latter_req_fn) + return e->ops->elevator_latter_req_fn(q, rq); next = rq->queuelist.next; if (next != &q->queue_head && next != &rq->queuelist) @@ -308,10 +399,10 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) { struct list_head *prev; - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_former_req_fn) - return e->elevator_former_req_fn(q, rq); + if (e->ops->elevator_former_req_fn) + return e->ops->elevator_former_req_fn(q, rq); prev = rq->queuelist.prev; if (prev != &q->queue_head && prev != &rq->queuelist) @@ -322,10 +413,10 @@ struct 
request *elv_former_request(request_queue_t *q, struct request *rq) int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_set_req_fn) - return e->elevator_set_req_fn(q, rq, gfp_mask); + if (e->ops->elevator_set_req_fn) + return e->ops->elevator_set_req_fn(q, rq, gfp_mask); rq->elevator_private = NULL; return 0; @@ -333,25 +424,25 @@ int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) void elv_put_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_put_req_fn) - e->elevator_put_req_fn(q, rq); + if (e->ops->elevator_put_req_fn) + e->ops->elevator_put_req_fn(q, rq); } int elv_may_queue(request_queue_t *q, int rw) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; - if (e->elevator_may_queue_fn) - return e->elevator_may_queue_fn(q, rw); + if (e->ops->elevator_may_queue_fn) + return e->ops->elevator_may_queue_fn(q, rw); - return 0; + return ELV_MQUEUE_MAY; } void elv_completed_request(request_queue_t *q, struct request *rq) { - elevator_t *e = &q->elevator; + elevator_t *e = q->elevator; /* * request is released from the driver, io must be done @@ -359,22 +450,20 @@ void elv_completed_request(request_queue_t *q, struct request *rq) if (blk_account_rq(rq)) q->in_flight--; - if (e->elevator_completed_req_fn) - e->elevator_completed_req_fn(q, rq); + if (e->ops->elevator_completed_req_fn) + e->ops->elevator_completed_req_fn(q, rq); } int elv_register_queue(struct request_queue *q) { - elevator_t *e; - - e = &q->elevator; + elevator_t *e = q->elevator; e->kobj.parent = kobject_get(&q->kobj); if (!e->kobj.parent) return -EBUSY; snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); - e->kobj.ktype = e->elevator_ktype; + e->kobj.ktype = e->elevator_type->elevator_ktype; return kobject_register(&e->kobj); } @@ -382,12 +471,131 @@ int elv_register_queue(struct request_queue *q) void elv_unregister_queue(struct request_queue *q) { if (q) { - elevator_t * e = &q->elevator; + elevator_t *e = q->elevator; kobject_unregister(&e->kobj); kobject_put(&q->kobj); } } +int elv_register(struct elevator_type *e) +{ + if (elevator_find(e->elevator_name)) + BUG(); + + spin_lock_irq(&elv_list_lock); + list_add_tail(&e->list, &elv_list); + spin_unlock_irq(&elv_list_lock); + + printk("io scheduler %s registered\n", e->elevator_name); + return 0; +} +EXPORT_SYMBOL_GPL(elv_register); + +void elv_unregister(struct elevator_type *e) +{ + spin_lock_irq(&elv_list_lock); + list_del_init(&e->list); + spin_unlock_irq(&elv_list_lock); +} +EXPORT_SYMBOL_GPL(elv_unregister); + +/* + * switch to new_e io scheduler. be careful not to introduce deadlocks - + * we don't free the old io scheduler, before we have allocated what we + * need for the new one. 
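
Every elv_* wrapper in this hunk now goes through q->elevator->ops and calls a hook only when the scheduler provides it, falling back to a default otherwise (elv_may_queue(), for instance, now returns ELV_MQUEUE_MAY rather than 0). A simplified, self-contained sketch of that ops-table dispatch; the types, hook signatures and the "noop" instance are stand-ins, not the kernel's.

#include <stdio.h>

enum { MQUEUE_NO, MQUEUE_MAY, MQUEUE_MUST };

struct sched_ops {
	int (*may_queue_fn)(int rw);
	void (*add_req_fn)(int rq);
};

struct sched_type {
	const char *name;
	struct sched_ops ops;
};

/* generic wrapper: call the hook only if the scheduler provides one */
static int sched_may_queue(struct sched_type *e, int rw)
{
	if (e->ops.may_queue_fn)
		return e->ops.may_queue_fn(rw);
	return MQUEUE_MAY;	/* default, mirroring elv_may_queue() */
}

static void noop_add_req(int rq)
{
	printf("noop: queued request %d\n", rq);
}

static struct sched_type noop_sched = {
	.name = "noop",
	.ops = {
		/* no may_queue hook: the wrapper falls back to MQUEUE_MAY */
		.add_req_fn = noop_add_req,
	},
};

int main(void)
{
	printf("%s may_queue -> %d\n", noop_sched.name,
	       sched_may_queue(&noop_sched, 0));
	noop_sched.ops.add_req_fn(1);
	return 0;
}
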
this way we have a chance of going back to the old + * one, if the new one fails init for some reason + */ +static void elevator_switch(request_queue_t *q, struct elevator_type *new_e) +{ + elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL); + elevator_t *old_elevator; + + if (!e) { + printk("elevator: out of memory\n"); + return; + } + + blk_wait_queue_drained(q); + + /* + * unregister old elevator data + */ + elv_unregister_queue(q); + old_elevator = q->elevator; + + /* + * attach and start new elevator + */ + if (elevator_attach(q, new_e, e)) + goto fail; + + if (elv_register_queue(q)) + goto fail_register; + + /* + * finally exit old elevator and start queue again + */ + elevator_exit(old_elevator); + blk_finish_queue_drain(q); + return; + +fail_register: + /* + * switch failed, exit the new io scheduler and reattach the old + * one again (along with re-adding the sysfs dir) + */ + elevator_exit(e); +fail: + q->elevator = old_elevator; + elv_register_queue(q); + blk_finish_queue_drain(q); + printk("elevator: switch to %s failed\n", new_e->elevator_name); +} + +ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count) +{ + char elevator_name[ELV_NAME_MAX]; + struct elevator_type *e; + + memset(elevator_name, 0, sizeof(elevator_name)); + strncpy(elevator_name, name, sizeof(elevator_name)); + + if (elevator_name[strlen(elevator_name) - 1] == '\n') + elevator_name[strlen(elevator_name) - 1] = '\0'; + + e = elevator_find(elevator_name); + if (!e) { + printk("elevator: type %s not found\n", elevator_name); + return -EINVAL; + } + + elevator_switch(q, e); + return count; +} + +ssize_t elv_iosched_show(request_queue_t *q, char *name) +{ + elevator_t *e = q->elevator; + struct elevator_type *elv = e->elevator_type; + struct list_head *entry; + int len = 0; + + spin_lock_irq(q->queue_lock); + list_for_each(entry, &elv_list) { + struct elevator_type *__e; + + __e = list_entry(entry, struct elevator_type, list); + if (!strcmp(elv->elevator_name, __e->elevator_name)) + len += sprintf(name+len, "[%s] ", elv->elevator_name); + else + len += sprintf(name+len, "%s ", __e->elevator_name); + } + spin_unlock_irq(q->queue_lock); + + len += sprintf(len+name, "\n"); + return len; +} + module_init(elevator_global_init); EXPORT_SYMBOL(elv_add_request); diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 26fdf6be6bd0..3ba6430899df 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -243,6 +243,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) blk_queue_hardsect_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); + q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ @@ -1395,7 +1396,8 @@ void blk_cleanup_queue(request_queue_t * q) if (!atomic_dec_and_test(&q->refcnt)) return; - elevator_exit(q); + if (q->elevator) + elevator_exit(q->elevator); del_timer_sync(&q->unplug_timer); kblockd_flush(); @@ -1418,6 +1420,7 @@ static int blk_init_free_list(request_queue_t *q) rl->count[READ] = rl->count[WRITE] = 0; init_waitqueue_head(&rl->wait[READ]); init_waitqueue_head(&rl->wait[WRITE]); + init_waitqueue_head(&rl->drain); rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep); @@ -1429,45 +1432,6 @@ static int blk_init_free_list(request_queue_t *q) static int __make_request(request_queue_t *, struct bio *); -static elevator_t *chosen_elevator = -#if defined(CONFIG_IOSCHED_AS) - 
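
elevator_switch() above allocates and attaches the new scheduler before the old one is torn down, so a failed switch can fall back to the previous elevator and re-register its sysfs directory. A stripped-down sketch of that allocate-new-before-freeing-old ordering; attach() here is a stand-in that can be made to fail, and queue draining and sysfs registration are omitted.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sched {
	char name[16];
};

static struct sched *active;	/* stands in for q->elevator */

/* pretend attach can fail for one particular scheduler name */
static int attach(struct sched *e, const char *name)
{
	if (!strcmp(name, "broken"))
		return -1;
	snprintf(e->name, sizeof(e->name), "%s", name);
	active = e;
	return 0;
}

/* switch schedulers: only discard the old one once the new one works */
static void elevator_switch(const char *name)
{
	struct sched *old = active;
	struct sched *e = malloc(sizeof(*e));

	if (!e) {
		printf("elevator: out of memory\n");
		return;
	}
	if (attach(e, name)) {
		free(e);
		active = old;	/* fall back to the previous scheduler */
		printf("elevator: switch to %s failed\n", name);
		return;
	}
	free(old);
	printf("elevator: now using %s\n", active->name);
}

int main(void)
{
	active = malloc(sizeof(*active));
	snprintf(active->name, sizeof(active->name), "anticipatory");

	elevator_switch("deadline");	/* succeeds */
	elevator_switch("broken");	/* fails, deadline stays active */
	printf("active: %s\n", active->name);
	free(active);
	return 0;
}
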
&iosched_as; -#elif defined(CONFIG_IOSCHED_DEADLINE) - &iosched_deadline; -#elif defined(CONFIG_IOSCHED_CFQ) - &iosched_cfq; -#elif defined(CONFIG_IOSCHED_NOOP) - &elevator_noop; -#else - NULL; -#error "You must have at least 1 I/O scheduler selected" -#endif - -#if defined(CONFIG_IOSCHED_AS) || defined(CONFIG_IOSCHED_DEADLINE) || defined (CONFIG_IOSCHED_NOOP) -static int __init elevator_setup(char *str) -{ -#ifdef CONFIG_IOSCHED_DEADLINE - if (!strcmp(str, "deadline")) - chosen_elevator = &iosched_deadline; -#endif -#ifdef CONFIG_IOSCHED_AS - if (!strcmp(str, "as")) - chosen_elevator = &iosched_as; -#endif -#ifdef CONFIG_IOSCHED_CFQ - if (!strcmp(str, "cfq")) - chosen_elevator = &iosched_cfq; -#endif -#ifdef CONFIG_IOSCHED_NOOP - if (!strcmp(str, "noop")) - chosen_elevator = &elevator_noop; -#endif - return 1; -} - -__setup("elevator=", elevator_setup); -#endif /* CONFIG_IOSCHED_AS || CONFIG_IOSCHED_DEADLINE || CONFIG_IOSCHED_NOOP */ - request_queue_t *blk_alloc_queue(int gfp_mask) { request_queue_t *q = kmem_cache_alloc(requestq_cachep, gfp_mask); @@ -1520,21 +1484,14 @@ EXPORT_SYMBOL(blk_alloc_queue); **/ request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) { - request_queue_t *q; - static int printed; + request_queue_t *q = blk_alloc_queue(GFP_KERNEL); - q = blk_alloc_queue(GFP_KERNEL); if (!q) return NULL; if (blk_init_free_list(q)) goto out_init; - if (!printed) { - printed = 1; - printk("Using %s io scheduler\n", chosen_elevator->elevator_name); - } - q->request_fn = rfn; q->back_merge_fn = ll_back_merge_fn; q->front_merge_fn = ll_front_merge_fn; @@ -1555,8 +1512,10 @@ request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) /* * all done */ - if (!elevator_init(q, chosen_elevator)) + if (!elevator_init(q, NULL)) { + blk_queue_congestion_threshold(q); return q; + } blk_cleanup_queue(q); out_init: @@ -1584,13 +1543,20 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq) mempool_free(rq, q->rq.rq_pool); } -static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) +static inline struct request *blk_alloc_request(request_queue_t *q, int rw, + int gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); if (!rq) return NULL; + /* + * first three bits are identical in rq->flags and bio->bi_rw, + * see bio.h and blkdev.h + */ + rq->flags = rw; + if (!elv_set_request(q, rq, gfp_mask)) return rq; @@ -1602,7 +1568,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. */ -static inline int ioc_batching(struct io_context *ioc) +static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) { if (!ioc) return 0; @@ -1612,7 +1578,7 @@ static inline int ioc_batching(struct io_context *ioc) * even if the batch times out, otherwise we could theoretically * lose wakeups. */ - return ioc->nr_batch_requests == BLK_BATCH_REQ || + return ioc->nr_batch_requests == q->nr_batching || (ioc->nr_batch_requests > 0 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); } @@ -1623,12 +1589,12 @@ static inline int ioc_batching(struct io_context *ioc) * is the behaviour we want though - once it gets a wakeup it should be given * a nice run. 
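
ioc_batching() above exempts a task from the queue-full limit while it either still holds its full per-queue batch (q->nr_batching) or is inside the BLK_BATCH_TIME window since it last waited. A rough userspace sketch of that check, using wall-clock seconds where the kernel uses jiffies; the constants are illustrative.

#include <stdio.h>
#include <time.h>

#define BATCH_REQ	32	/* stands in for q->nr_batching */
#define BATCH_TIME	2	/* seconds, stands in for BLK_BATCH_TIME */

struct io_context {
	int nr_batch_requests;
	time_t last_waited;
};

/* may this context keep allocating even though the queue is "full"? */
static int ioc_batching(const struct io_context *ioc)
{
	if (!ioc)
		return 0;
	return ioc->nr_batch_requests == BATCH_REQ ||
	       (ioc->nr_batch_requests > 0 &&
		time(NULL) < ioc->last_waited + BATCH_TIME);
}

static void ioc_set_batching(struct io_context *ioc)
{
	if (!ioc || ioc_batching(ioc))
		return;
	ioc->nr_batch_requests = BATCH_REQ;
	ioc->last_waited = time(NULL);
}

int main(void)
{
	struct io_context ioc = { 0, 0 };

	printf("before: batching=%d\n", ioc_batching(&ioc));
	ioc_set_batching(&ioc);		/* task just slept on a full queue */
	ioc.nr_batch_requests--;	/* it allocated one request */
	printf("after:  batching=%d\n", ioc_batching(&ioc));
	return 0;
}
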
*/ -void ioc_set_batching(struct io_context *ioc) +void ioc_set_batching(request_queue_t *q, struct io_context *ioc) { - if (!ioc || ioc_batching(ioc)) + if (!ioc || ioc_batching(q, ioc)) return; - ioc->nr_batch_requests = BLK_BATCH_REQ; + ioc->nr_batch_requests = q->nr_batching; ioc->last_waited = jiffies; } @@ -1644,11 +1610,14 @@ static void freed_request(request_queue_t *q, int rw) if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); if (rl->count[rw]+1 <= q->nr_requests) { + smp_mb(); if (waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); - if (!waitqueue_active(&rl->wait[rw])) - blk_clear_queue_full(q, rw); + blk_clear_queue_full(q, rw); } + if (unlikely(waitqueue_active(&rl->drain)) && + !rl->count[READ] && !rl->count[WRITE]) + wake_up(&rl->drain); } #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) @@ -1661,6 +1630,9 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) struct request_list *rl = &q->rq; struct io_context *ioc = get_io_context(gfp_mask); + if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) + return NULL; + spin_lock_irq(q->queue_lock); if (rl->count[rw]+1 >= q->nr_requests) { /* @@ -1670,13 +1642,22 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) * will be blocked. */ if (!blk_queue_full(q, rw)) { - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); blk_set_queue_full(q, rw); } } - if (blk_queue_full(q, rw) - && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { + switch (elv_may_queue(q, rw)) { + case ELV_MQUEUE_NO: + spin_unlock_irq(q->queue_lock); + goto out; + case ELV_MQUEUE_MAY: + break; + case ELV_MQUEUE_MUST: + goto get_rq; + } + + if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { /* * The queue is full and the allocating process is not a * "batcher", and not exempted by the IO scheduler @@ -1685,12 +1666,13 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) goto out; } +get_rq: rl->count[rw]++; if (rl->count[rw] >= queue_congestion_on_threshold(q)) set_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, gfp_mask); + rq = blk_alloc_request(q, rw, gfp_mask); if (!rq) { /* * Allocation failed presumably due to memory. Undo anything @@ -1705,17 +1687,11 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) goto out; } - if (ioc_batching(ioc)) + if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; INIT_LIST_HEAD(&rq->queuelist); - /* - * first three bits are identical in rq->flags and bio->bi_rw, - * see bio.h and blkdev.h - */ - rq->flags = rw; - rq->errors = 0; rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; @@ -1764,7 +1740,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) * See ioc_batching, ioc_set_batching */ ioc = get_io_context(GFP_NOIO); - ioc_set_batching(ioc); + ioc_set_batching(q, ioc); put_io_context(ioc); } finish_wait(&rl->wait[rw], &wait); @@ -2506,6 +2482,70 @@ static inline void blk_partition_remap(struct bio *bio) } } +void blk_finish_queue_drain(request_queue_t *q) +{ + struct request_list *rl = &q->rq; + + clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); + wake_up(&rl->wait[0]); + wake_up(&rl->wait[1]); + wake_up(&rl->drain); +} + +/* + * We rely on the fact that only requests allocated through blk_alloc_request() + * have io scheduler private data structures associated with them. 
Any other + * type of request (allocated on stack or through kmalloc()) should not go + * to the io scheduler core, but be attached to the queue head instead. + */ +void blk_wait_queue_drained(request_queue_t *q) +{ + struct request_list *rl = &q->rq; + DEFINE_WAIT(wait); + + spin_lock_irq(q->queue_lock); + set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); + + while (rl->count[READ] || rl->count[WRITE]) { + prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE); + + if (rl->count[READ] || rl->count[WRITE]) { + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + io_schedule(); + spin_lock_irq(q->queue_lock); + } + + finish_wait(&rl->drain, &wait); + } + + spin_unlock_irq(q->queue_lock); +} + +/* + * block waiting for the io scheduler being started again. + */ +static inline void block_wait_queue_running(request_queue_t *q) +{ + DEFINE_WAIT(wait); + + while (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) { + struct request_list *rl = &q->rq; + + prepare_to_wait_exclusive(&rl->drain, &wait, + TASK_UNINTERRUPTIBLE); + + /* + * re-check the condition. avoids using prepare_to_wait() + * in the fast path (queue is running) + */ + if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) + io_schedule(); + + finish_wait(&rl->drain, &wait); + } +} + /** * generic_make_request: hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. @@ -2595,6 +2635,8 @@ end_io: if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) goto end_io; + block_wait_queue_running(q); + /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. @@ -3018,6 +3060,7 @@ void kblockd_flush(void) { flush_workqueue(kblockd_workqueue); } +EXPORT_SYMBOL(kblockd_flush); int __init blk_dev_init(void) { @@ -3036,6 +3079,7 @@ int __init blk_dev_init(void) blk_max_low_pfn = max_low_pfn; blk_max_pfn = max_pfn; + return 0; } @@ -3052,9 +3096,13 @@ void put_io_context(struct io_context *ioc) if (atomic_dec_and_test(&ioc->refcount)) { if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); + if (ioc->cic && ioc->cic->dtor) + ioc->cic->dtor(ioc->cic); + kmem_cache_free(iocontext_cachep, ioc); } } +EXPORT_SYMBOL(put_io_context); /* Called by the exitting task */ void exit_io_context(void) @@ -3064,14 +3112,15 @@ void exit_io_context(void) local_irq_save(flags); ioc = current->io_context; - if (ioc) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - put_io_context(ioc); - current->io_context = NULL; - } else - WARN_ON(1); + current->io_context = NULL; local_irq_restore(flags); + + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + if (ioc->cic && ioc->cic->exit) + ioc->cic->exit(ioc->cic); + + put_io_context(ioc); } /* @@ -3090,22 +3139,42 @@ struct io_context *get_io_context(int gfp_flags) local_irq_save(flags); ret = tsk->io_context; - if (ret == NULL) { - ret = kmem_cache_alloc(iocontext_cachep, GFP_ATOMIC); - if (ret) { - atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; + if (ret) + goto out; + + local_irq_restore(flags); + + ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->last_waited = jiffies; /* doesn't matter... 
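
blk_wait_queue_drained() above sets a drain flag so no new requests are allocated, then sleeps until the read and write counts both reach zero, with freed_request() waking the sleeper when the last request completes. A userspace analog of that drain handshake, using a mutex and condition variable in place of the kernel wait queue and a single "completer" thread standing in for the driver finishing requests.

#include <stdio.h>
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drain = PTHREAD_COND_INITIALIZER;
static int in_flight = 3;	/* pretend three requests are outstanding */
static int draining;

static void *completer(void *arg)
{
	while (1) {
		pthread_mutex_lock(&lock);
		if (in_flight == 0) {
			pthread_mutex_unlock(&lock);
			break;
		}
		if (--in_flight == 0 && draining)
			pthread_cond_signal(&drain);	/* like freed_request() */
		pthread_mutex_unlock(&lock);
		usleep(1000);
	}
	return NULL;
}

static void wait_queue_drained(void)
{
	pthread_mutex_lock(&lock);
	draining = 1;			/* no new allocations from now on */
	while (in_flight > 0)
		pthread_cond_wait(&drain, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, completer, NULL);
	wait_queue_drained();
	printf("queue drained, safe to switch schedulers\n");
	pthread_join(t, NULL);
	return 0;
}
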
*/ + ret->nr_batch_requests = 0; /* because this is 0 */ + ret->aic = NULL; + ret->cic = NULL; + spin_lock_init(&ret->lock); + + local_irq_save(flags); + + /* + * very unlikely, someone raced with us in setting up the task + * io context. free new context and just grab a reference. + */ + if (!tsk->io_context) tsk->io_context = ret; + else { + kmem_cache_free(iocontext_cachep, ret); + ret = tsk->io_context; } - } - if (ret) + +out: atomic_inc(&ret->refcount); - local_irq_restore(flags); + local_irq_restore(flags); + } + return ret; } +EXPORT_SYMBOL(get_io_context); void copy_io_context(struct io_context **pdst, struct io_context **psrc) { @@ -3119,6 +3188,7 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc) *pdst = src; } } +EXPORT_SYMBOL(copy_io_context); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) { @@ -3127,7 +3197,7 @@ void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) *ioc1 = *ioc2; *ioc2 = temp; } - +EXPORT_SYMBOL(swap_io_context); /* * sysfs parts below @@ -3285,11 +3355,18 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = { .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_iosched_entry = { + .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, + .show = elv_iosched_show, + .store = elv_iosched_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_iosched_entry.attr, NULL, }; diff --git a/drivers/block/noop-iosched.c b/drivers/block/noop-iosched.c index ffef40be1f92..707dddd7d881 100644 --- a/drivers/block/noop-iosched.c +++ b/drivers/block/noop-iosched.c @@ -83,12 +83,31 @@ struct request *elevator_noop_next_request(request_queue_t *q) return NULL; } -elevator_t elevator_noop = { - .elevator_merge_fn = elevator_noop_merge, - .elevator_merge_req_fn = elevator_noop_merge_requests, - .elevator_next_req_fn = elevator_noop_next_request, - .elevator_add_req_fn = elevator_noop_add_request, - .elevator_name = "noop", +static struct elevator_type elevator_noop = { + .ops = { + .elevator_merge_fn = elevator_noop_merge, + .elevator_merge_req_fn = elevator_noop_merge_requests, + .elevator_next_req_fn = elevator_noop_next_request, + .elevator_add_req_fn = elevator_noop_add_request, + }, + .elevator_name = "noop", + .elevator_owner = THIS_MODULE, }; -EXPORT_SYMBOL(elevator_noop); +int noop_init(void) +{ + return elv_register(&elevator_noop); +} + +void noop_exit(void) +{ + elv_unregister(&elevator_noop); +} + +module_init(noop_init); +module_exit(noop_exit); + + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("No-op IO scheduler"); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c new file mode 100644 index 000000000000..fb80b6a91f84 --- /dev/null +++ b/drivers/block/pktcdvd.c @@ -0,0 +1,2679 @@ +/* + * Copyright (C) 2000 Jens Axboe <axboe@suse.de> + * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com> + * + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. + * + * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and + * DVD-RW devices (aka an exercise in block layer masturbation) + * + * + * TODO: (circa order of when I will fix it) + * - Only able to write on CD-RW media right now. 
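
get_io_context() above now allocates the context with interrupts enabled and only afterwards re-checks, with interrupts off, whether the task raced and already has one, freeing the fresh allocation in that case. A small sketch of that allocate-then-recheck pattern, with a mutex standing in for the irq-disabled section and a plain int for the atomic refcount.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct io_context {
	int refcount;
};

static struct io_context *task_ioc;	/* stands in for tsk->io_context */
static pthread_mutex_t task_lock = PTHREAD_MUTEX_INITIALIZER;

static struct io_context *get_io_context(void)
{
	struct io_context *ret;

	pthread_mutex_lock(&task_lock);
	ret = task_ioc;
	if (ret)
		goto out;
	pthread_mutex_unlock(&task_lock);

	/* allocate outside the critical section */
	ret = calloc(1, sizeof(*ret));
	if (!ret)
		return NULL;
	ret->refcount = 1;

	pthread_mutex_lock(&task_lock);
	if (!task_ioc) {
		task_ioc = ret;		/* we won: install our context */
	} else {
		free(ret);		/* raced: somebody else set it up */
		ret = task_ioc;
	}
out:
	ret->refcount++;		/* reference for the caller */
	pthread_mutex_unlock(&task_lock);
	return ret;
}

int main(void)
{
	struct io_context *a = get_io_context();
	struct io_context *b = get_io_context();

	printf("same context: %s, refcount=%d\n",
	       a == b ? "yes" : "no", a->refcount);
	free(a);
	return 0;
}
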
+ * - check host application code on media and set it in write page + * - interface for UDF <-> packet to negotiate a new location when a write + * fails. + * - handle OPC, especially for -RW media + * + * Theory of operation: + * + * We use a custom make_request_fn function that forwards reads directly to + * the underlying CD device. Write requests are either attached directly to + * a live packet_data object, or simply stored sequentially in a list for + * later processing by the kcdrwd kernel thread. This driver doesn't use + * any elevator functionally as defined by the elevator_s struct, but the + * underlying CD device uses a standard elevator. + * + * This strategy makes it possible to do very late merging of IO requests. + * A new bio sent to pkt_make_request can be merged with a live packet_data + * object even if the object is in the data gathering state. + * + *************************************************************************/ + +#define VERSION_CODE "v0.2.0a 2004-07-14 Jens Axboe (axboe@suse.de) and petero2@telia.com" + +#include <linux/pktcdvd.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/file.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/miscdevice.h> +#include <linux/suspend.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_ioctl.h> + +#include <asm/uaccess.h> + +#if PACKET_DEBUG +#define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) +#else +#define DPRINTK(fmt, args...) +#endif + +#if PACKET_DEBUG > 1 +#define VPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) +#else +#define VPRINTK(fmt, args...) +#endif + +#define MAX_SPEED 0xffff + +#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1)) + +static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; +static struct proc_dir_entry *pkt_proc; +static int pkt_major; +static struct semaphore ctl_mutex; /* Serialize open/close/setup/teardown */ +static mempool_t *psd_pool; + + +static void pkt_bio_finished(struct pktcdvd_device *pd) +{ + BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); + if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { + VPRINTK("pktcdvd: queue empty\n"); + atomic_set(&pd->iosched.attention, 1); + wake_up(&pd->wqueue); + } +} + +static void pkt_bio_destructor(struct bio *bio) +{ + kfree(bio->bi_io_vec); + kfree(bio); +} + +static struct bio *pkt_bio_alloc(int nr_iovecs) +{ + struct bio_vec *bvl = NULL; + struct bio *bio; + + bio = kmalloc(sizeof(struct bio), GFP_KERNEL); + if (!bio) + goto no_bio; + bio_init(bio); + + bvl = kmalloc(nr_iovecs * sizeof(struct bio_vec), GFP_KERNEL); + if (!bvl) + goto no_bvl; + memset(bvl, 0, nr_iovecs * sizeof(struct bio_vec)); + + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; + bio->bi_destructor = pkt_bio_destructor; + + return bio; + + no_bvl: + kfree(bio); + no_bio: + return NULL; +} + +/* + * Allocate a packet_data struct + */ +static struct packet_data *pkt_alloc_packet_data(void) +{ + int i; + struct packet_data *pkt; + + pkt = kmalloc(sizeof(struct packet_data), GFP_KERNEL); + if (!pkt) + goto no_pkt; + memset(pkt, 0, sizeof(struct packet_data)); + + pkt->w_bio = pkt_bio_alloc(PACKET_MAX_SIZE); + if (!pkt->w_bio) + goto no_bio; + + for (i = 0; i < PAGES_PER_PACKET; i++) { + pkt->pages[i] = alloc_page(GFP_KERNEL); + if (!pkt->pages[i]) + goto no_page; + } + for (i = 0; i < PAGES_PER_PACKET; i++) + 
clear_page(page_address(pkt->pages[i])); + + spin_lock_init(&pkt->lock); + + for (i = 0; i < PACKET_MAX_SIZE; i++) { + struct bio *bio = pkt_bio_alloc(1); + if (!bio) + goto no_rd_bio; + pkt->r_bios[i] = bio; + } + + return pkt; + +no_rd_bio: + for (i = 0; i < PACKET_MAX_SIZE; i++) { + struct bio *bio = pkt->r_bios[i]; + if (bio) + bio_put(bio); + } + +no_page: + for (i = 0; i < PAGES_PER_PACKET; i++) + if (pkt->pages[i]) + __free_page(pkt->pages[i]); + bio_put(pkt->w_bio); +no_bio: + kfree(pkt); +no_pkt: + return NULL; +} + +/* + * Free a packet_data struct + */ +static void pkt_free_packet_data(struct packet_data *pkt) +{ + int i; + + for (i = 0; i < PACKET_MAX_SIZE; i++) { + struct bio *bio = pkt->r_bios[i]; + if (bio) + bio_put(bio); + } + for (i = 0; i < PAGES_PER_PACKET; i++) + __free_page(pkt->pages[i]); + bio_put(pkt->w_bio); + kfree(pkt); +} + +static void pkt_shrink_pktlist(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *next; + + BUG_ON(!list_empty(&pd->cdrw.pkt_active_list)); + + list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { + pkt_free_packet_data(pkt); + } +} + +static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) +{ + struct packet_data *pkt; + + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); + INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); + spin_lock_init(&pd->cdrw.active_list_lock); + while (nr_packets > 0) { + pkt = pkt_alloc_packet_data(); + if (!pkt) { + pkt_shrink_pktlist(pd); + return 0; + } + pkt->id = nr_packets; + pkt->pd = pd; + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + nr_packets--; + } + return 1; +} + +static void *pkt_rb_alloc(int gfp_mask, void *data) +{ + return kmalloc(sizeof(struct pkt_rb_node), gfp_mask); +} + +static void pkt_rb_free(void *ptr, void *data) +{ + kfree(ptr); +} + +static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) +{ + struct rb_node *n = rb_next(&node->rb_node); + if (!n) + return NULL; + return rb_entry(n, struct pkt_rb_node, rb_node); +} + +static inline void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + rb_erase(&node->rb_node, &pd->bio_queue); + mempool_free(node, pd->rb_pool); + pd->bio_queue_size--; + BUG_ON(pd->bio_queue_size < 0); +} + +/* + * Find the first node in the pd->bio_queue rb tree with a starting sector >= s. + */ +static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s) +{ + struct rb_node *n = pd->bio_queue.rb_node; + struct rb_node *next; + struct pkt_rb_node *tmp; + + if (!n) { + BUG_ON(pd->bio_queue_size > 0); + return NULL; + } + + for (;;) { + tmp = rb_entry(n, struct pkt_rb_node, rb_node); + if (s <= tmp->bio->bi_sector) + next = n->rb_left; + else + next = n->rb_right; + if (!next) + break; + n = next; + } + + if (s > tmp->bio->bi_sector) { + tmp = pkt_rbtree_next(tmp); + if (!tmp) + return NULL; + } + BUG_ON(s > tmp->bio->bi_sector); + return tmp; +} + +/* + * Insert a node into the pd->bio_queue rb tree. 
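
pkt_rbtree_find() above looks up the first queued bio whose starting sector is at or after a given sector, stepping to rb_next() if the descent lands on a smaller key. The sketch below gets the same answer on a plain unbalanced binary search tree by remembering the best candidate during the descent instead; node layout and sector values are illustrative.

#include <stdio.h>
#include <stdlib.h>

struct node {
	long sector;			/* key: starting sector of a queued bio */
	struct node *left, *right;
};

static struct node *insert(struct node *root, long sector)
{
	if (!root) {
		struct node *n = calloc(1, sizeof(*n));
		n->sector = sector;
		return n;
	}
	if (sector < root->sector)
		root->left = insert(root->left, sector);
	else
		root->right = insert(root->right, sector);
	return root;
}

/* first node whose sector is >= s, or NULL if none */
static struct node *find_ge(struct node *root, long s)
{
	struct node *best = NULL;

	while (root) {
		if (s <= root->sector) {
			best = root;	/* candidate; look for a smaller one */
			root = root->left;
		} else {
			root = root->right;
		}
	}
	return best;
}

int main(void)
{
	long sectors[] = { 96, 32, 160, 64 };
	struct node *root = NULL, *n;
	int i;

	for (i = 0; i < 4; i++)
		root = insert(root, sectors[i]);

	n = find_ge(root, 50);
	printf("first queued bio at or after sector 50: %ld\n",
	       n ? n->sector : -1);	/* prints 64 */
	return 0;
}
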
+ */ +static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + struct rb_node **p = &pd->bio_queue.rb_node; + struct rb_node *parent = NULL; + sector_t s = node->bio->bi_sector; + struct pkt_rb_node *tmp; + + while (*p) { + parent = *p; + tmp = rb_entry(parent, struct pkt_rb_node, rb_node); + if (s < tmp->bio->bi_sector) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->rb_node, parent, p); + rb_insert_color(&node->rb_node, &pd->bio_queue); + pd->bio_queue_size++; +} + +/* + * Add a bio to a single linked list defined by its head and tail pointers. + */ +static inline void pkt_add_list_last(struct bio *bio, struct bio **list_head, struct bio **list_tail) +{ + bio->bi_next = NULL; + if (*list_tail) { + BUG_ON((*list_head) == NULL); + (*list_tail)->bi_next = bio; + (*list_tail) = bio; + } else { + BUG_ON((*list_head) != NULL); + (*list_head) = bio; + (*list_tail) = bio; + } +} + +/* + * Remove and return the first bio from a single linked list defined by its + * head and tail pointers. + */ +static inline struct bio *pkt_get_list_first(struct bio **list_head, struct bio **list_tail) +{ + struct bio *bio; + + if (*list_head == NULL) + return NULL; + + bio = *list_head; + *list_head = bio->bi_next; + if (*list_head == NULL) + *list_tail = NULL; + + bio->bi_next = NULL; + return bio; +} + +/* + * Send a packet_command to the underlying block device and + * wait for completion. + */ +static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + char sense[SCSI_SENSE_BUFFERSIZE]; + request_queue_t *q; + struct request *rq; + DECLARE_COMPLETION(wait); + int err = 0; + + q = bdev_get_queue(pd->bdev); + + rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ, + __GFP_WAIT); + rq->errors = 0; + rq->rq_disk = pd->bdev->bd_disk; + rq->bio = NULL; + rq->buffer = NULL; + rq->timeout = 60*HZ; + rq->data = cgc->buffer; + rq->data_len = cgc->buflen; + rq->sense = sense; + memset(sense, 0, sizeof(sense)); + rq->sense_len = 0; + rq->flags |= REQ_BLOCK_PC | REQ_HARDBARRIER; + if (cgc->quiet) + rq->flags |= REQ_QUIET; + memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); + if (sizeof(rq->cmd) > CDROM_PACKET_SIZE) + memset(rq->cmd + CDROM_PACKET_SIZE, 0, sizeof(rq->cmd) - CDROM_PACKET_SIZE); + + rq->ref_count++; + rq->flags |= REQ_NOMERGE; + rq->waiting = &wait; + elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); + generic_unplug_device(q); + wait_for_completion(&wait); + + if (rq->errors) + err = -EIO; + + blk_put_request(rq); + return err; +} + +/* + * A generic sense dump / resolve mechanism should be implemented across + * all ATAPI + SCSI devices. 
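
pkt_add_list_last() and pkt_get_list_first() above keep a FIFO of bios as a singly linked list tracked by separate head and tail pointers, so both appending and popping are O(1). A standalone sketch of the same structure with a toy payload in place of struct bio.

#include <stdio.h>
#include <stdlib.h>

struct bio_stub {
	int id;
	struct bio_stub *next;
};

static void add_list_last(struct bio_stub *bio, struct bio_stub **head,
			  struct bio_stub **tail)
{
	bio->next = NULL;
	if (*tail) {
		(*tail)->next = bio;
		*tail = bio;
	} else {
		*head = *tail = bio;
	}
}

static struct bio_stub *get_list_first(struct bio_stub **head,
				       struct bio_stub **tail)
{
	struct bio_stub *bio = *head;

	if (!bio)
		return NULL;
	*head = bio->next;
	if (!*head)
		*tail = NULL;		/* list is empty again */
	bio->next = NULL;
	return bio;
}

int main(void)
{
	struct bio_stub *head = NULL, *tail = NULL, *b;
	int i;

	for (i = 1; i <= 3; i++) {
		b = malloc(sizeof(*b));
		b->id = i;
		add_list_last(b, &head, &tail);
	}
	while ((b = get_list_first(&head, &tail)) != NULL) {
		printf("popped bio %d\n", b->id);	/* 1, 2, 3 in order */
		free(b);
	}
	return 0;
}
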
+ */ +static void pkt_dump_sense(struct packet_command *cgc) +{ + static char *info[9] = { "No sense", "Recovered error", "Not ready", + "Medium error", "Hardware error", "Illegal request", + "Unit attention", "Data protect", "Blank check" }; + int i; + struct request_sense *sense = cgc->sense; + + printk("pktcdvd:"); + for (i = 0; i < CDROM_PACKET_SIZE; i++) + printk(" %02x", cgc->cmd[i]); + printk(" - "); + + if (sense == NULL) { + printk("no sense\n"); + return; + } + + printk("sense %02x.%02x.%02x", sense->sense_key, sense->asc, sense->ascq); + + if (sense->sense_key > 8) { + printk(" (INVALID)\n"); + return; + } + + printk(" (%s)\n", info[sense->sense_key]); +} + +/* + * flush the drive cache to media + */ +static int pkt_flush_cache(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_FLUSH_CACHE; + cgc.quiet = 1; + + /* + * the IMMED bit -- we default to not setting it, although that + * would allow a much faster close, this is safer + */ +#if 0 + cgc.cmd[1] = 1 << 1; +#endif + return pkt_generic_packet(pd, &cgc); +} + +/* + * speed is given as the normal factor, e.g. 4 for 4x + */ +static int pkt_set_speed(struct pktcdvd_device *pd, unsigned write_speed, unsigned read_speed) +{ + struct packet_command cgc; + struct request_sense sense; + int ret; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sense = &sense; + cgc.cmd[0] = GPCMD_SET_SPEED; + cgc.cmd[2] = (read_speed >> 8) & 0xff; + cgc.cmd[3] = read_speed & 0xff; + cgc.cmd[4] = (write_speed >> 8) & 0xff; + cgc.cmd[5] = write_speed & 0xff; + + if ((ret = pkt_generic_packet(pd, &cgc))) + pkt_dump_sense(&cgc); + + return ret; +} + +/* + * Queue a bio for processing by the low-level CD device. Must be called + * from process context. + */ +static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_prio_read) +{ + spin_lock(&pd->iosched.lock); + if (bio_data_dir(bio) == READ) { + pkt_add_list_last(bio, &pd->iosched.read_queue, + &pd->iosched.read_queue_tail); + if (high_prio_read) + pd->iosched.high_prio_read = 1; + } else { + pkt_add_list_last(bio, &pd->iosched.write_queue, + &pd->iosched.write_queue_tail); + } + spin_unlock(&pd->iosched.lock); + + atomic_set(&pd->iosched.attention, 1); + wake_up(&pd->wqueue); +} + +/* + * Process the queued read/write requests. This function handles special + * requirements for CDRW drives: + * - A cache flush command must be inserted before a read request if the + * previous request was a write. + * - Switching between reading and writing is slow, so don't it more often + * than necessary. + * - Set the read speed according to current usage pattern. When only reading + * from the device, it's best to use the highest possible read speed, but + * when switching often between reading and writing, it's better to have the + * same read and write speeds. + * - Reads originating from user space should have higher priority than reads + * originating from pkt_gather_data, because some process is usually waiting + * on reads of the first kind. 
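
pkt_set_speed() above splits each 16-bit speed value across two command bytes, most significant byte first (cmd[2]/cmd[3] for the read speed, cmd[4]/cmd[5] for the write speed). A tiny sketch of that packing; the speed values below are purely illustrative.

#include <stdio.h>

/* place a 16-bit value into cmd[] MSB first, as pkt_set_speed() does */
static void put_be16(unsigned char *cmd, unsigned int val)
{
	cmd[0] = (val >> 8) & 0xff;
	cmd[1] = val & 0xff;
}

int main(void)
{
	unsigned char cmd[12] = { 0 };
	unsigned int read_speed = 0xffff;	/* MAX_SPEED, i.e. no limit */
	unsigned int write_speed = 706;		/* illustrative value */

	put_be16(&cmd[2], read_speed);
	put_be16(&cmd[4], write_speed);
	printf("cmd[2..5] = %02x %02x %02x %02x\n",
	       cmd[2], cmd[3], cmd[4], cmd[5]);
	return 0;
}
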
+ */ +static void pkt_iosched_process_queue(struct pktcdvd_device *pd) +{ + request_queue_t *q; + + if (atomic_read(&pd->iosched.attention) == 0) + return; + atomic_set(&pd->iosched.attention, 0); + + q = bdev_get_queue(pd->bdev); + + for (;;) { + struct bio *bio; + int reads_queued, writes_queued, high_prio_read; + + spin_lock(&pd->iosched.lock); + reads_queued = (pd->iosched.read_queue != NULL); + writes_queued = (pd->iosched.write_queue != NULL); + if (!reads_queued) + pd->iosched.high_prio_read = 0; + high_prio_read = pd->iosched.high_prio_read; + spin_unlock(&pd->iosched.lock); + + if (!reads_queued && !writes_queued) + break; + + if (pd->iosched.writing) { + if (high_prio_read || (!writes_queued && reads_queued)) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + VPRINTK("pktcdvd: write, waiting\n"); + break; + } + pkt_flush_cache(pd); + pd->iosched.writing = 0; + } + } else { + if (!reads_queued && writes_queued) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + VPRINTK("pktcdvd: read, waiting\n"); + break; + } + pd->iosched.writing = 1; + } + } + + spin_lock(&pd->iosched.lock); + if (pd->iosched.writing) { + bio = pkt_get_list_first(&pd->iosched.write_queue, + &pd->iosched.write_queue_tail); + } else { + bio = pkt_get_list_first(&pd->iosched.read_queue, + &pd->iosched.read_queue_tail); + } + spin_unlock(&pd->iosched.lock); + + if (!bio) + continue; + + if (bio_data_dir(bio) == READ) + pd->iosched.successive_reads += bio->bi_size >> 10; + else + pd->iosched.successive_reads = 0; + if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { + if (pd->read_speed == pd->write_speed) { + pd->read_speed = MAX_SPEED; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } else { + if (pd->read_speed != pd->write_speed) { + pd->read_speed = pd->write_speed; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } + + atomic_inc(&pd->cdrw.pending_bios); + generic_make_request(bio); + } +} + +/* + * Special care is needed if the underlying block device has a small + * max_phys_segments value. + */ +static int pkt_set_segment_merging(struct pktcdvd_device *pd, request_queue_t *q) +{ + if ((pd->settings.size << 9) / CD_FRAMESIZE <= q->max_phys_segments) { + /* + * The cdrom device can handle one segment/frame + */ + clear_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else if ((pd->settings.size << 9) / PAGE_SIZE <= q->max_phys_segments) { + /* + * We can handle this case at the expense of some extra memory + * copies during write operations + */ + set_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else { + printk("pktcdvd: cdrom max_phys_segments too small\n"); + return -EIO; + } +} + +/* + * Copy CD_FRAMESIZE bytes from src_bio into a destination page + */ +static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, + struct page *dst_page, int dst_offs) +{ + unsigned int copy_size = CD_FRAMESIZE; + + while (copy_size > 0) { + struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg); + void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) + + src_bvl->bv_offset + offs; + void *vto = page_address(dst_page) + dst_offs; + int len = min_t(int, copy_size, src_bvl->bv_len - offs); + + BUG_ON(len < 0); + memcpy(vto, vfrom, len); + kunmap_atomic(src_bvl->bv_page, KM_USER0); + + seg++; + offs = 0; + dst_offs += len; + copy_size -= len; + } +} + +/* + * Copy all data for this packet to pkt->pages[], so that + * a) The number of required segments for the write bio is minimized, which + * is necessary for some scsi controllers. 
+ * b) The data can be used as cache to avoid read requests if we receive a + * new write request for the same zone. + */ +static void pkt_make_local_copy(struct packet_data *pkt, struct page **pages, int *offsets) +{ + int f, p, offs; + + /* Copy all data to pkt->pages[] */ + p = 0; + offs = 0; + for (f = 0; f < pkt->frames; f++) { + if (pages[f] != pkt->pages[p]) { + void *vfrom = kmap_atomic(pages[f], KM_USER0) + offsets[f]; + void *vto = page_address(pkt->pages[p]) + offs; + memcpy(vto, vfrom, CD_FRAMESIZE); + kunmap_atomic(pages[f], KM_USER0); + pages[f] = pkt->pages[p]; + offsets[f] = offs; + } else { + BUG_ON(offsets[f] != offs); + } + offs += CD_FRAMESIZE; + if (offs >= PAGE_SIZE) { + BUG_ON(offs > PAGE_SIZE); + offs = 0; + p++; + } + } +} + +static int pkt_end_io_read(struct bio *bio, unsigned int bytes_done, int err) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + if (bio->bi_size) + return 1; + + VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio, + (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); + + if (err) + atomic_inc(&pkt->io_errors); + if (atomic_dec_and_test(&pkt->io_wait)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + pkt_bio_finished(pd); + + return 0; +} + +static int pkt_end_io_packet_write(struct bio *bio, unsigned int bytes_done, int err) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + if (bio->bi_size) + return 1; + + VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err); + + pd->stats.pkt_ended++; + + pkt_bio_finished(pd); + atomic_dec(&pkt->io_wait); + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + return 0; +} + +/* + * Schedule reads for the holes in a packet + */ +static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int frames_read = 0; + struct bio *bio; + int f; + char written[PACKET_MAX_SIZE]; + + BUG_ON(!pkt->orig_bios); + + atomic_set(&pkt->io_wait, 0); + atomic_set(&pkt->io_errors, 0); + + if (pkt->cache_valid) { + VPRINTK("pkt_gather_data: zone %llx cached\n", + (unsigned long long)pkt->sector); + goto out_account; + } + + /* + * Figure out which frames we need to read before we can write. + */ + memset(written, 0, sizeof(written)); + spin_lock(&pkt->lock); + for (bio = pkt->orig_bios; bio; bio = bio->bi_next) { + int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); + int num_frames = bio->bi_size / CD_FRAMESIZE; + BUG_ON(first_frame < 0); + BUG_ON(first_frame + num_frames > pkt->frames); + for (f = first_frame; f < first_frame + num_frames; f++) + written[f] = 1; + } + spin_unlock(&pkt->lock); + + /* + * Schedule reads for missing parts of the packet. 
+ */ + for (f = 0; f < pkt->frames; f++) { + int p, offset; + if (written[f]) + continue; + bio = pkt->r_bios[f]; + bio_init(bio); + bio->bi_max_vecs = 1; + bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); + bio->bi_bdev = pd->bdev; + bio->bi_end_io = pkt_end_io_read; + bio->bi_private = pkt; + + p = (f * CD_FRAMESIZE) / PAGE_SIZE; + offset = (f * CD_FRAMESIZE) % PAGE_SIZE; + VPRINTK("pkt_gather_data: Adding frame %d, page:%p offs:%d\n", + f, pkt->pages[p], offset); + if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) + BUG(); + + atomic_inc(&pkt->io_wait); + bio->bi_rw = READ; + pkt_queue_bio(pd, bio, 0); + frames_read++; + } + +out_account: + VPRINTK("pkt_gather_data: need %d frames for zone %llx\n", + frames_read, (unsigned long long)pkt->sector); + pd->stats.pkt_started++; + pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); + pd->stats.secs_w += pd->settings.size; +} + +/* + * Find a packet matching zone, or the least recently used packet if + * there is no match. + */ +static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone) +{ + struct packet_data *pkt; + + list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) { + if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) { + list_del_init(&pkt->list); + if (pkt->sector != zone) + pkt->cache_valid = 0; + break; + } + } + return pkt; +} + +static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + if (pkt->cache_valid) { + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + } else { + list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list); + } +} + +/* + * recover a failed write, query for relocation if possible + * + * returns 1 if recovery is possible, or 0 if not + * + */ +static int pkt_start_recovery(struct packet_data *pkt) +{ + /* + * FIXME. We need help from the file system to implement + * recovery handling. + */ + return 0; +#if 0 + struct request *rq = pkt->rq; + struct pktcdvd_device *pd = rq->rq_disk->private_data; + struct block_device *pkt_bdev; + struct super_block *sb = NULL; + unsigned long old_block, new_block; + sector_t new_sector; + + pkt_bdev = bdget(kdev_t_to_nr(pd->pkt_dev)); + if (pkt_bdev) { + sb = get_super(pkt_bdev); + bdput(pkt_bdev); + } + + if (!sb) + return 0; + + if (!sb->s_op || !sb->s_op->relocate_blocks) + goto out; + + old_block = pkt->sector / (CD_FRAMESIZE >> 9); + if (sb->s_op->relocate_blocks(sb, old_block, &new_block)) + goto out; + + new_sector = new_block * (CD_FRAMESIZE >> 9); + pkt->sector = new_sector; + + pkt->bio->bi_sector = new_sector; + pkt->bio->bi_next = NULL; + pkt->bio->bi_flags = 1 << BIO_UPTODATE; + pkt->bio->bi_idx = 0; + + BUG_ON(pkt->bio->bi_rw != (1 << BIO_RW)); + BUG_ON(pkt->bio->bi_vcnt != pkt->frames); + BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE); + BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write); + BUG_ON(pkt->bio->bi_private != pkt); + + drop_super(sb); + return 1; + +out: + drop_super(sb); + return 0; +#endif +} + +static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) +{ +#if PACKET_DEBUG > 1 + static const char *state_name[] = { + "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" + }; + enum packet_data_state old_state = pkt->state; + VPRINTK("pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, + state_name[old_state], state_name[state]); +#endif + pkt->state = state; +} + +/* + * Scan the work queue to see if we can start a new packet. + * returns non-zero if any work was done. 
+ */ +static int pkt_handle_queue(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *p; + struct bio *bio = NULL; + sector_t zone = 0; /* Suppress gcc warning */ + struct pkt_rb_node *node, *first_node; + struct rb_node *n; + + VPRINTK("handle_queue\n"); + + atomic_set(&pd->scan_queue, 0); + + if (list_empty(&pd->cdrw.pkt_free_list)) { + VPRINTK("handle_queue: no pkt\n"); + return 0; + } + + /* + * Try to find a zone we are not already working on. + */ + spin_lock(&pd->lock); + first_node = pkt_rbtree_find(pd, pd->current_sector); + if (!first_node) { + n = rb_first(&pd->bio_queue); + if (n) + first_node = rb_entry(n, struct pkt_rb_node, rb_node); + } + node = first_node; + while (node) { + bio = node->bio; + zone = ZONE(bio->bi_sector, pd); + list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { + if (p->sector == zone) + goto try_next_bio; + } + break; +try_next_bio: + node = pkt_rbtree_next(node); + if (!node) { + n = rb_first(&pd->bio_queue); + if (n) + node = rb_entry(n, struct pkt_rb_node, rb_node); + } + if (node == first_node) + node = NULL; + } + spin_unlock(&pd->lock); + if (!bio) { + VPRINTK("handle_queue: no bio\n"); + return 0; + } + + pkt = pkt_get_packet_data(pd, zone); + BUG_ON(!pkt); + + pd->current_sector = zone + pd->settings.size; + pkt->sector = zone; + pkt->frames = pd->settings.size >> 2; + BUG_ON(pkt->frames > PACKET_MAX_SIZE); + pkt->write_size = 0; + + /* + * Scan work queue for bios in the same zone and link them + * to this packet. + */ + spin_lock(&pd->lock); + VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone); + while ((node = pkt_rbtree_find(pd, zone)) != NULL) { + bio = node->bio; + VPRINTK("pkt_handle_queue: found zone=%llx\n", + (unsigned long long)ZONE(bio->bi_sector, pd)); + if (ZONE(bio->bi_sector, pd) != zone) + break; + pkt_rbtree_erase(pd, node); + spin_lock(&pkt->lock); + pkt_add_list_last(bio, &pkt->orig_bios, &pkt->orig_bios_tail); + pkt->write_size += bio->bi_size / CD_FRAMESIZE; + spin_unlock(&pkt->lock); + } + spin_unlock(&pd->lock); + + pkt->sleep_time = max(PACKET_WAIT_TIME, 1); + pkt_set_state(pkt, PACKET_WAITING_STATE); + atomic_set(&pkt->run_sm, 1); + + spin_lock(&pd->cdrw.active_list_lock); + list_add(&pkt->list, &pd->cdrw.pkt_active_list); + spin_unlock(&pd->cdrw.active_list_lock); + + return 1; +} + +/* + * Assemble a bio to write one packet and queue the bio for processing + * by the underlying block device. + */ +static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + struct bio *bio; + struct page *pages[PACKET_MAX_SIZE]; + int offsets[PACKET_MAX_SIZE]; + int f; + int frames_write; + + for (f = 0; f < pkt->frames; f++) { + pages[f] = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; + offsets[f] = (f * CD_FRAMESIZE) % PAGE_SIZE; + } + + /* + * Fill-in pages[] and offsets[] with data from orig_bios. 
+	 */
+	frames_write = 0;
+	spin_lock(&pkt->lock);
+	for (bio = pkt->orig_bios; bio; bio = bio->bi_next) {
+		int segment = bio->bi_idx;
+		int src_offs = 0;
+		int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
+		int num_frames = bio->bi_size / CD_FRAMESIZE;
+		BUG_ON(first_frame < 0);
+		BUG_ON(first_frame + num_frames > pkt->frames);
+		for (f = first_frame; f < first_frame + num_frames; f++) {
+			struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
+
+			while (src_offs >= src_bvl->bv_len) {
+				src_offs -= src_bvl->bv_len;
+				segment++;
+				BUG_ON(segment >= bio->bi_vcnt);
+				src_bvl = bio_iovec_idx(bio, segment);
+			}
+
+			if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
+				pages[f] = src_bvl->bv_page;
+				offsets[f] = src_bvl->bv_offset + src_offs;
+			} else {
+				pkt_copy_bio_data(bio, segment, src_offs,
+						  pages[f], offsets[f]);
+			}
+			src_offs += CD_FRAMESIZE;
+			frames_write++;
+		}
+	}
+	pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
+	spin_unlock(&pkt->lock);
+
+	VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
+		frames_write, (unsigned long long)pkt->sector);
+	BUG_ON(frames_write != pkt->write_size);
+
+	if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
+		pkt_make_local_copy(pkt, pages, offsets);
+		pkt->cache_valid = 1;
+	} else {
+		pkt->cache_valid = 0;
+	}
+
+	/* Start the write request */
+	bio_init(pkt->w_bio);
+	pkt->w_bio->bi_max_vecs = PACKET_MAX_SIZE;
+	pkt->w_bio->bi_sector = pkt->sector;
+	pkt->w_bio->bi_bdev = pd->bdev;
+	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+	pkt->w_bio->bi_private = pkt;
+	for (f = 0; f < pkt->frames; f++) {
+		if ((f + 1 < pkt->frames) && (pages[f + 1] == pages[f]) &&
+		    (offsets[f + 1] == offsets[f] + CD_FRAMESIZE)) {
+			if (!bio_add_page(pkt->w_bio, pages[f], CD_FRAMESIZE * 2, offsets[f]))
+				BUG();
+			f++;
+		} else {
+			if (!bio_add_page(pkt->w_bio, pages[f], CD_FRAMESIZE, offsets[f]))
+				BUG();
+		}
+	}
+	VPRINTK("pktcdvd: vcnt=%d\n", pkt->w_bio->bi_vcnt);
+
+	atomic_set(&pkt->io_wait, 1);
+	pkt->w_bio->bi_rw = WRITE;
+	pkt_queue_bio(pd, pkt->w_bio, 0);
+}
+
+static void pkt_finish_packet(struct packet_data *pkt, int uptodate)
+{
+	struct bio *bio, *next;
+
+	if (!uptodate)
+		pkt->cache_valid = 0;
+
+	/* Finish all bios corresponding to this packet */
+	bio = pkt->orig_bios;
+	while (bio) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		bio_endio(bio, bio->bi_size, uptodate ?
0 : -EIO);
+		bio = next;
+	}
+	pkt->orig_bios = pkt->orig_bios_tail = NULL;
+}
+
+static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+	int uptodate;
+
+	VPRINTK("run_state_machine: pkt %d\n", pkt->id);
+
+	for (;;) {
+		switch (pkt->state) {
+		case PACKET_WAITING_STATE:
+			if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0))
+				return;
+
+			pkt->sleep_time = 0;
+			pkt_gather_data(pd, pkt);
+			pkt_set_state(pkt, PACKET_READ_WAIT_STATE);
+			break;
+
+		case PACKET_READ_WAIT_STATE:
+			if (atomic_read(&pkt->io_wait) > 0)
+				return;
+
+			if (atomic_read(&pkt->io_errors) > 0) {
+				pkt_set_state(pkt, PACKET_RECOVERY_STATE);
+			} else {
+				pkt_start_write(pd, pkt);
+			}
+			break;
+
+		case PACKET_WRITE_WAIT_STATE:
+			if (atomic_read(&pkt->io_wait) > 0)
+				return;
+
+			if (test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags)) {
+				pkt_set_state(pkt, PACKET_FINISHED_STATE);
+			} else {
+				pkt_set_state(pkt, PACKET_RECOVERY_STATE);
+			}
+			break;
+
+		case PACKET_RECOVERY_STATE:
+			if (pkt_start_recovery(pkt)) {
+				pkt_start_write(pd, pkt);
+			} else {
+				VPRINTK("No recovery possible\n");
+				pkt_set_state(pkt, PACKET_FINISHED_STATE);
+			}
+			break;
+
+		case PACKET_FINISHED_STATE:
+			uptodate = test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags);
+			pkt_finish_packet(pkt, uptodate);
+			return;
+
+		default:
+			BUG();
+			break;
+		}
+	}
+}
+
+static void pkt_handle_packets(struct pktcdvd_device *pd)
+{
+	struct packet_data *pkt, *next;
+
+	VPRINTK("pkt_handle_packets\n");
+
+	/*
+	 * Run state machine for active packets
+	 */
+	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+		if (atomic_read(&pkt->run_sm) > 0) {
+			atomic_set(&pkt->run_sm, 0);
+			pkt_run_state_machine(pd, pkt);
+		}
+	}
+
+	/*
+	 * Move no longer active packets to the free list
+	 */
+	spin_lock(&pd->cdrw.active_list_lock);
+	list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) {
+		if (pkt->state == PACKET_FINISHED_STATE) {
+			list_del(&pkt->list);
+			pkt_put_packet_data(pd, pkt);
+			pkt_set_state(pkt, PACKET_IDLE_STATE);
+			atomic_set(&pd->scan_queue, 1);
+		}
+	}
+	spin_unlock(&pd->cdrw.active_list_lock);
+}
+
+static void pkt_count_states(struct pktcdvd_device *pd, int *states)
+{
+	struct packet_data *pkt;
+	int i;
+
+	for (i = 0; i < PACKET_NUM_STATES; i++)
+		states[i] = 0;
+
+	spin_lock(&pd->cdrw.active_list_lock);
+	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+		states[pkt->state]++;
+	}
+	spin_unlock(&pd->cdrw.active_list_lock);
+}
+
+/*
+ * kcdrwd is woken up when writes have been queued for one of our
+ * registered devices
+ */
+static int kcdrwd(void *foobar)
+{
+	struct pktcdvd_device *pd = foobar;
+	struct packet_data *pkt;
+	long min_sleep_time, residue;
+
+	set_user_nice(current, -20);
+
+	for (;;) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		/*
+		 * Wait until there is something to do
+		 */
+		add_wait_queue(&pd->wqueue, &wait);
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			/* Check if we need to run pkt_handle_queue */
+			if (atomic_read(&pd->scan_queue) > 0)
+				goto work_to_do;
+
+			/* Check if we need to run the state machine for some packet */
+			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+				if (atomic_read(&pkt->run_sm) > 0)
+					goto work_to_do;
+			}
+
+			/* Check if we need to process the iosched queues */
+			if (atomic_read(&pd->iosched.attention) != 0)
+				goto work_to_do;
+
+			/* Otherwise, go to sleep */
+			if (PACKET_DEBUG > 1) {
+				int states[PACKET_NUM_STATES];
+				pkt_count_states(pd, states);
+				VPRINTK("kcdrwd: i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
+					states[0],
states[1], states[2], states[3], + states[4], states[5]); + } + + min_sleep_time = MAX_SCHEDULE_TIMEOUT; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sleep_time && pkt->sleep_time < min_sleep_time) + min_sleep_time = pkt->sleep_time; + } + + generic_unplug_device(bdev_get_queue(pd->bdev)); + + VPRINTK("kcdrwd: sleeping\n"); + residue = schedule_timeout(min_sleep_time); + VPRINTK("kcdrwd: wake up\n"); + + /* make swsusp happy with our thread */ + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (!pkt->sleep_time) + continue; + pkt->sleep_time -= min_sleep_time - residue; + if (pkt->sleep_time <= 0) { + pkt->sleep_time = 0; + atomic_inc(&pkt->run_sm); + } + } + + if (signal_pending(current)) { + flush_signals(current); + } + if (kthread_should_stop()) + break; + } +work_to_do: + set_current_state(TASK_RUNNING); + remove_wait_queue(&pd->wqueue, &wait); + + if (kthread_should_stop()) + break; + + /* + * if pkt_handle_queue returns true, we can queue + * another request. + */ + while (pkt_handle_queue(pd)) + ; + + /* + * Handle packet state machine + */ + pkt_handle_packets(pd); + + /* + * Handle iosched queues + */ + pkt_iosched_process_queue(pd); + } + + return 0; +} + +static void pkt_print_settings(struct pktcdvd_device *pd) +{ + printk("pktcdvd: %s packets, ", pd->settings.fp ? "Fixed" : "Variable"); + printk("%u blocks, ", pd->settings.size >> 2); + printk("Mode-%c disc\n", pd->settings.block_mode == 8 ? '1' : '2'); +} + +static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, + int page_code, int page_control) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + + cgc->cmd[0] = GPCMD_MODE_SENSE_10; + cgc->cmd[2] = page_code | (page_control << 6); + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_READ; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + memset(cgc->buffer, 0, 2); + cgc->cmd[0] = GPCMD_MODE_SELECT_10; + cgc->cmd[1] = 0x10; /* PF */ + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_WRITE; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) +{ + struct packet_command cgc; + int ret; + + /* set up command and get the disc info */ + init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_DISC_INFO; + cgc.cmd[8] = cgc.buflen = 2; + cgc.quiet = 1; + + if ((ret = pkt_generic_packet(pd, &cgc))) + return ret; + + /* not all drives have the same disc_info length, so requeue + * packet with the length the drive tells us it can supply + */ + cgc.buflen = be16_to_cpu(di->disc_information_length) + + sizeof(di->disc_information_length); + + if (cgc.buflen > sizeof(disc_information)) + cgc.buflen = sizeof(disc_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti) +{ + struct packet_command cgc; + int ret; + + init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; + cgc.cmd[1] = type & 3; + cgc.cmd[4] = (track & 0xff00) >> 8; + cgc.cmd[5] = track & 0xff; + cgc.cmd[8] = 8; + cgc.quiet = 1; + + if ((ret = pkt_generic_packet(pd, &cgc))) + return ret; + + cgc.buflen = 
be16_to_cpu(ti->track_information_length) + + sizeof(ti->track_information_length); + + if (cgc.buflen > sizeof(track_information)) + cgc.buflen = sizeof(track_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static int pkt_get_last_written(struct pktcdvd_device *pd, long *last_written) +{ + disc_information di; + track_information ti; + __u32 last_track; + int ret = -1; + + if ((ret = pkt_get_disc_info(pd, &di))) + return ret; + + last_track = (di.last_track_msb << 8) | di.last_track_lsb; + if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + return ret; + + /* if this track is blank, try the previous. */ + if (ti.blank) { + last_track--; + if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + return ret; + } + + /* if last recorded field is valid, return it. */ + if (ti.lra_v) { + *last_written = be32_to_cpu(ti.last_rec_address); + } else { + /* make it up instead */ + *last_written = be32_to_cpu(ti.track_start) + + be32_to_cpu(ti.track_size); + if (ti.free_blocks) + *last_written -= (be32_to_cpu(ti.free_blocks) + 7); + } + return 0; +} + +/* + * write mode select package based on pd->settings + */ +static int pkt_set_write_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct request_sense sense; + write_param_page *wp; + char buffer[128]; + int ret, size; + + /* doesn't apply to DVD+RW */ + if (pd->mmc3_profile == 0x1a) + return 0; + + memset(buffer, 0, sizeof(buffer)); + init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); + cgc.sense = &sense; + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + pkt_dump_sense(&cgc); + return ret; + } + + size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff)); + pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff); + if (size > sizeof(buffer)) + size = sizeof(buffer); + + /* + * now get it all + */ + init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); + cgc.sense = &sense; + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + pkt_dump_sense(&cgc); + return ret; + } + + /* + * write page is offset header + block descriptor length + */ + wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset]; + + wp->fp = pd->settings.fp; + wp->track_mode = pd->settings.track_mode; + wp->write_type = pd->settings.write_type; + wp->data_block_type = pd->settings.block_mode; + + wp->multi_session = 0; + +#ifdef PACKET_USE_LS + wp->link_size = 7; + wp->ls_v = 1; +#endif + + if (wp->data_block_type == PACKET_BLOCK_MODE1) { + wp->session_format = 0; + wp->subhdr2 = 0x20; + } else if (wp->data_block_type == PACKET_BLOCK_MODE2) { + wp->session_format = 0x20; + wp->subhdr2 = 8; +#if 0 + wp->mcn[0] = 0x80; + memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1); +#endif + } else { + /* + * paranoia + */ + printk("pktcdvd: write mode wrong %d\n", wp->data_block_type); + return 1; + } + wp->packet_size = cpu_to_be32(pd->settings.size >> 2); + + cgc.buflen = cgc.cmd[8] = size; + if ((ret = pkt_mode_select(pd, &cgc))) { + pkt_dump_sense(&cgc); + return ret; + } + + pkt_print_settings(pd); + return 0; +} + +/* + * 0 -- we can write to this track, 1 -- we can't + */ +static int pkt_good_track(track_information *ti) +{ + /* + * only good for CD-RW at the moment, not DVD-RW + */ + + /* + * FIXME: only for FP + */ + if (ti->fp == 0) + return 0; + + /* + * "good" settings as per Mt Fuji. 
+ */ + if (ti->rt == 0 && ti->blank == 0 && ti->packet == 1) + return 0; + + if (ti->rt == 0 && ti->blank == 1 && ti->packet == 1) + return 0; + + if (ti->rt == 1 && ti->blank == 0 && ti->packet == 1) + return 0; + + printk("pktcdvd: bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + return 1; +} + +/* + * 0 -- we can write to this disc, 1 -- we can't + */ +static int pkt_good_disc(struct pktcdvd_device *pd, disc_information *di) +{ + switch (pd->mmc3_profile) { + case 0x0a: /* CD-RW */ + case 0xffff: /* MMC3 not supported */ + break; + case 0x1a: /* DVD+RW */ + case 0x13: /* DVD-RW */ + return 0; + default: + printk("pktcdvd: Wrong disc profile (%x)\n", pd->mmc3_profile); + return 1; + } + + /* + * for disc type 0xff we should probably reserve a new track. + * but i'm not sure, should we leave this to user apps? probably. + */ + if (di->disc_type == 0xff) { + printk("pktcdvd: Unknown disc. No track?\n"); + return 1; + } + + if (di->disc_type != 0x20 && di->disc_type != 0) { + printk("pktcdvd: Wrong disc type (%x)\n", di->disc_type); + return 1; + } + + if (di->erasable == 0) { + printk("pktcdvd: Disc not erasable\n"); + return 1; + } + + if (di->border_status == PACKET_SESSION_RESERVED) { + printk("pktcdvd: Can't write to last track (reserved)\n"); + return 1; + } + + return 0; +} + +static int pkt_probe_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + unsigned char buf[12]; + disc_information di; + track_information ti; + int ret, track; + + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_GET_CONFIGURATION; + cgc.cmd[8] = 8; + ret = pkt_generic_packet(pd, &cgc); + pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7]; + + memset(&di, 0, sizeof(disc_information)); + memset(&ti, 0, sizeof(track_information)); + + if ((ret = pkt_get_disc_info(pd, &di))) { + printk("failed get_disc\n"); + return ret; + } + + if (pkt_good_disc(pd, &di)) + return -ENXIO; + + switch (pd->mmc3_profile) { + case 0x1a: /* DVD+RW */ + printk("pktcdvd: inserted media is DVD+RW\n"); + break; + case 0x13: /* DVD-RW */ + printk("pktcdvd: inserted media is DVD-RW\n"); + break; + default: + printk("pktcdvd: inserted media is CD-R%s\n", di.erasable ? "W" : ""); + break; + } + pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; + + track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ + if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { + printk("pktcdvd: failed get_track\n"); + return ret; + } + + if (pkt_good_track(&ti)) { + printk("pktcdvd: can't write to this track\n"); + return -ENXIO; + } + + /* + * we keep packet size in 512 byte units, makes it easier to + * deal with request calculations. + */ + pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; + if (pd->settings.size == 0) { + printk("pktcdvd: detected zero packet size!\n"); + pd->settings.size = 128; + } + pd->settings.fp = ti.fp; + pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1); + + if (ti.nwa_v) { + pd->nwa = be32_to_cpu(ti.next_writable); + set_bit(PACKET_NWA_VALID, &pd->flags); + } + + /* + * in theory we could use lra on -RW media as well and just zero + * blocks that haven't been written yet, but in practice that + * is just a no-go. we'll use that for -R, naturally. 
+ */ + if (ti.lra_v) { + pd->lra = be32_to_cpu(ti.last_rec_address); + set_bit(PACKET_LRA_VALID, &pd->flags); + } else { + pd->lra = 0xffffffff; + set_bit(PACKET_LRA_VALID, &pd->flags); + } + + /* + * fine for now + */ + pd->settings.link_loss = 7; + pd->settings.write_type = 0; /* packet */ + pd->settings.track_mode = ti.track_mode; + + /* + * mode1 or mode2 disc + */ + switch (ti.data_mode) { + case PACKET_MODE1: + pd->settings.block_mode = PACKET_BLOCK_MODE1; + break; + case PACKET_MODE2: + pd->settings.block_mode = PACKET_BLOCK_MODE2; + break; + default: + printk("pktcdvd: unknown data mode\n"); + return 1; + } + return 0; +} + +/* + * enable/disable write caching on drive + */ +static int pkt_write_caching(struct pktcdvd_device *pd, int set) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[64]; + int ret; + + memset(buf, 0, sizeof(buf)); + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.sense = &sense; + cgc.buflen = pd->mode_offset + 12; + + /* + * caching mode page might not be there, so quiet this command + */ + cgc.quiet = 1; + + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0))) + return ret; + + buf[pd->mode_offset + 10] |= (!!set << 2); + + cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); + ret = pkt_mode_select(pd, &cgc); + if (ret) { + printk("pktcdvd: write caching control failed\n"); + pkt_dump_sense(&cgc); + } else if (!ret && set) + printk("pktcdvd: enabled write caching on %s\n", pd->name); + return ret; +} + +static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL; + cgc.cmd[4] = lockflag ? 1 : 0; + return pkt_generic_packet(pd, &cgc); +} + +/* + * Returns drive maximum write speed + */ +static int pkt_get_max_speed(struct pktcdvd_device *pd, unsigned *write_speed) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[256+18]; + unsigned char *cap_buf; + int ret, offset; + + memset(buf, 0, sizeof(buf)); + cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); + cgc.sense = &sense; + + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + cgc.buflen = pd->mode_offset + cap_buf[1] + 2 + + sizeof(struct mode_page_header); + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + pkt_dump_sense(&cgc); + return ret; + } + } + + offset = 20; /* Obsoleted field, used by older drives */ + if (cap_buf[1] >= 28) + offset = 28; /* Current write speed selected */ + if (cap_buf[1] >= 30) { + /* If the drive reports at least one "Logical Unit Write + * Speed Performance Descriptor Block", use the information + * in the first block. 
(contains the highest speed)
+		 */
+		int num_spdb = (cap_buf[30] << 8) + cap_buf[31];
+		if (num_spdb > 0)
+			offset = 34;
+	}
+
+	*write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1];
+	return 0;
+}
+
+/* These tables from cdrecord - I don't have orange book */
+/* standard speed CD-RW (1-4x) */
+static char clv_to_speed[16] = {
+	/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+	0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/* high speed CD-RW (-10x) */
+static char hs_clv_to_speed[16] = {
+	/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+	0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/* ultra high speed CD-RW */
+static char us_clv_to_speed[16] = {
+	/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+	0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0
+};
+
+/*
+ * reads the maximum media speed from ATIP
+ */
+static int pkt_media_speed(struct pktcdvd_device *pd, unsigned *speed)
+{
+	struct packet_command cgc;
+	struct request_sense sense;
+	unsigned char buf[64];
+	unsigned int size, st, sp;
+	int ret;
+
+	init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
+	cgc.sense = &sense;
+	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
+	cgc.cmd[1] = 2;
+	cgc.cmd[2] = 4; /* READ ATIP */
+	cgc.cmd[8] = 2;
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret) {
+		pkt_dump_sense(&cgc);
+		return ret;
+	}
+	size = ((unsigned int) buf[0]<<8) + buf[1] + 2;
+	if (size > sizeof(buf))
+		size = sizeof(buf);
+
+	init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
+	cgc.sense = &sense;
+	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
+	cgc.cmd[1] = 2;
+	cgc.cmd[2] = 4;
+	cgc.cmd[8] = size;
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret) {
+		pkt_dump_sense(&cgc);
+		return ret;
+	}
+
+	if (!(buf[6] & 0x40)) {
+		printk("pktcdvd: Disc type is not CD-RW\n");
+		return 1;
+	}
+	if (!(buf[6] & 0x4)) {
+		printk("pktcdvd: A1 values on media are not valid, maybe not CDRW?\n");
+		return 1;
+	}
+
+	st = (buf[6] >> 3) & 0x7; /* disc sub-type */
+
+	sp = buf[16] & 0xf; /* max speed from ATIP A1 field */
+
+	/* Info from cdrecord */
+	switch (st) {
+	case 0: /* standard speed */
+		*speed = clv_to_speed[sp];
+		break;
+	case 1: /* high speed */
+		*speed = hs_clv_to_speed[sp];
+		break;
+	case 2: /* ultra high speed */
+		*speed = us_clv_to_speed[sp];
+		break;
+	default:
+		printk("pktcdvd: Unknown disc sub-type %d\n",st);
+		return 1;
+	}
+	if (*speed) {
+		printk("pktcdvd: Max.
media speed: %d\n",*speed); + return 0; + } else { + printk("pktcdvd: Unknown speed %d for sub-type %d\n",sp,st); + return 1; + } +} + +static int pkt_perform_opc(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct request_sense sense; + int ret; + + VPRINTK("pktcdvd: Performing OPC\n"); + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sense = &sense; + cgc.timeout = 60*HZ; + cgc.cmd[0] = GPCMD_SEND_OPC; + cgc.cmd[1] = 1; + if ((ret = pkt_generic_packet(pd, &cgc))) + pkt_dump_sense(&cgc); + return ret; +} + +static int pkt_open_write(struct pktcdvd_device *pd) +{ + int ret; + unsigned int write_speed, media_write_speed, read_speed; + + if ((ret = pkt_probe_settings(pd))) { + DPRINTK("pktcdvd: %s failed probe\n", pd->name); + return -EIO; + } + + if ((ret = pkt_set_write_settings(pd))) { + DPRINTK("pktcdvd: %s failed saving write settings\n", pd->name); + return -EIO; + } + + pkt_write_caching(pd, USE_WCACHING); + + if ((ret = pkt_get_max_speed(pd, &write_speed))) + write_speed = 16 * 177; + switch (pd->mmc3_profile) { + case 0x13: /* DVD-RW */ + case 0x1a: /* DVD+RW */ + DPRINTK("pktcdvd: write speed %ukB/s\n", write_speed); + break; + default: + if ((ret = pkt_media_speed(pd, &media_write_speed))) + media_write_speed = 16; + write_speed = min(write_speed, media_write_speed * 177); + DPRINTK("pktcdvd: write speed %ux\n", write_speed / 176); + break; + } + read_speed = write_speed; + + if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { + DPRINTK("pktcdvd: %s couldn't set write speed\n", pd->name); + return -EIO; + } + pd->write_speed = write_speed; + pd->read_speed = read_speed; + + if ((ret = pkt_perform_opc(pd))) { + DPRINTK("pktcdvd: %s Optimum Power Calibration failed\n", pd->name); + } + + return 0; +} + +/* + * called at open time. + */ +static int pkt_open_dev(struct pktcdvd_device *pd, int write) +{ + int ret; + long lba; + request_queue_t *q; + + /* + * We need to re-open the cdrom device without O_NONBLOCK to be able + * to read/write from/to it. It is already opened in O_NONBLOCK mode + * so bdget() can't fail. + */ + bdget(pd->bdev->bd_dev); + if ((ret = blkdev_get(pd->bdev, FMODE_READ, O_RDONLY))) + goto out; + + if ((ret = pkt_get_last_written(pd, &lba))) { + printk("pktcdvd: pkt_get_last_written failed\n"); + goto out_putdev; + } + + set_capacity(pd->disk, lba << 2); + set_capacity(pd->bdev->bd_disk, lba << 2); + bd_set_size(pd->bdev, (loff_t)lba << 11); + + q = bdev_get_queue(pd->bdev); + if (write) { + if ((ret = pkt_open_write(pd))) + goto out_putdev; + /* + * Some CDRW drives can not handle writes larger than one packet, + * even if the size is a multiple of the packet size. + */ + spin_lock_irq(q->queue_lock); + blk_queue_max_sectors(q, pd->settings.size); + spin_unlock_irq(q->queue_lock); + set_bit(PACKET_WRITABLE, &pd->flags); + } else { + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + clear_bit(PACKET_WRITABLE, &pd->flags); + } + + if ((ret = pkt_set_segment_merging(pd, q))) + goto out_putdev; + + if (write) + printk("pktcdvd: %lukB available on disc\n", lba << 1); + + return 0; + +out_putdev: + blkdev_put(pd->bdev); +out: + return ret; +} + +/* + * called when the device is closed. makes sure that the device flushes + * the internal cache before we close. 
+ */ +static void pkt_release_dev(struct pktcdvd_device *pd, int flush) +{ + if (flush && pkt_flush_cache(pd)) + DPRINTK("pktcdvd: %s not flushing cache\n", pd->name); + + pkt_lock_door(pd, 0); + + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + blkdev_put(pd->bdev); +} + +static struct pktcdvd_device *pkt_find_dev_from_minor(int dev_minor) +{ + if (dev_minor >= MAX_WRITERS) + return NULL; + return pkt_devs[dev_minor]; +} + +static int pkt_open(struct inode *inode, struct file *file) +{ + struct pktcdvd_device *pd = NULL; + int ret; + + VPRINTK("pktcdvd: entering open\n"); + + down(&ctl_mutex); + pd = pkt_find_dev_from_minor(iminor(inode)); + if (!pd) { + ret = -ENODEV; + goto out; + } + BUG_ON(pd->refcnt < 0); + + pd->refcnt++; + if (pd->refcnt == 1) { + if (pkt_open_dev(pd, file->f_mode & FMODE_WRITE)) { + ret = -EIO; + goto out_dec; + } + /* + * needed here as well, since ext2 (among others) may change + * the blocksize at mount time + */ + set_blocksize(inode->i_bdev, CD_FRAMESIZE); + } + + up(&ctl_mutex); + return 0; + +out_dec: + pd->refcnt--; +out: + VPRINTK("pktcdvd: failed open (%d)\n", ret); + up(&ctl_mutex); + return ret; +} + +static int pkt_close(struct inode *inode, struct file *file) +{ + struct pktcdvd_device *pd = inode->i_bdev->bd_disk->private_data; + int ret = 0; + + down(&ctl_mutex); + pd->refcnt--; + BUG_ON(pd->refcnt < 0); + if (pd->refcnt == 0) { + int flush = test_bit(PACKET_WRITABLE, &pd->flags); + pkt_release_dev(pd, flush); + } + up(&ctl_mutex); + return ret; +} + + +static void *psd_pool_alloc(int gfp_mask, void *data) +{ + return kmalloc(sizeof(struct packet_stacked_data), gfp_mask); +} + +static void psd_pool_free(void *ptr, void *data) +{ + kfree(ptr); +} + +static int pkt_end_io_read_cloned(struct bio *bio, unsigned int bytes_done, int err) +{ + struct packet_stacked_data *psd = bio->bi_private; + struct pktcdvd_device *pd = psd->pd; + + if (bio->bi_size) + return 1; + + bio_put(bio); + bio_endio(psd->bio, psd->bio->bi_size, err); + mempool_free(psd, psd_pool); + pkt_bio_finished(pd); + return 0; +} + +static int pkt_make_request(request_queue_t *q, struct bio *bio) +{ + struct pktcdvd_device *pd; + char b[BDEVNAME_SIZE]; + sector_t zone; + struct packet_data *pkt; + int was_empty, blocked_bio; + struct pkt_rb_node *node; + + pd = q->queuedata; + if (!pd) { + printk("pktcdvd: %s incorrect request queue\n", bdevname(bio->bi_bdev, b)); + goto end_io; + } + + /* + * Clone READ bios so we can have our own bi_end_io callback. 
+ */ + if (bio_data_dir(bio) == READ) { + struct bio *cloned_bio = bio_clone(bio, GFP_NOIO); + struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); + + psd->pd = pd; + psd->bio = bio; + cloned_bio->bi_bdev = pd->bdev; + cloned_bio->bi_private = psd; + cloned_bio->bi_end_io = pkt_end_io_read_cloned; + pd->stats.secs_r += bio->bi_size >> 9; + pkt_queue_bio(pd, cloned_bio, 1); + return 0; + } + + if (!test_bit(PACKET_WRITABLE, &pd->flags)) { + printk("pktcdvd: WRITE for ro device %s (%llu)\n", + pd->name, (unsigned long long)bio->bi_sector); + goto end_io; + } + + if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { + printk("pktcdvd: wrong bio size\n"); + goto end_io; + } + + blk_queue_bounce(q, &bio); + + zone = ZONE(bio->bi_sector, pd); + VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", + (unsigned long long)bio->bi_sector, + (unsigned long long)(bio->bi_sector + bio_sectors(bio))); + + /* Check if we have to split the bio */ + { + struct bio_pair *bp; + sector_t last_zone; + int first_sectors; + + last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd); + if (last_zone != zone) { + BUG_ON(last_zone != zone + pd->settings.size); + first_sectors = last_zone - bio->bi_sector; + bp = bio_split(bio, bio_split_pool, first_sectors); + BUG_ON(!bp); + pkt_make_request(q, &bp->bio1); + pkt_make_request(q, &bp->bio2); + bio_pair_release(bp); + return 0; + } + } + + /* + * If we find a matching packet in state WAITING or READ_WAIT, we can + * just append this bio to that packet. + */ + spin_lock(&pd->cdrw.active_list_lock); + blocked_bio = 0; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sector == zone) { + spin_lock(&pkt->lock); + if ((pkt->state == PACKET_WAITING_STATE) || + (pkt->state == PACKET_READ_WAIT_STATE)) { + pkt_add_list_last(bio, &pkt->orig_bios, + &pkt->orig_bios_tail); + pkt->write_size += bio->bi_size / CD_FRAMESIZE; + if ((pkt->write_size >= pkt->frames) && + (pkt->state == PACKET_WAITING_STATE)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + spin_unlock(&pkt->lock); + spin_unlock(&pd->cdrw.active_list_lock); + return 0; + } else { + blocked_bio = 1; + } + spin_unlock(&pkt->lock); + } + } + spin_unlock(&pd->cdrw.active_list_lock); + + /* + * No matching packet found. Store the bio in the work queue. + */ + node = mempool_alloc(pd->rb_pool, GFP_NOIO); + BUG_ON(!node); + node->bio = bio; + spin_lock(&pd->lock); + BUG_ON(pd->bio_queue_size < 0); + was_empty = (pd->bio_queue_size == 0); + pkt_rbtree_insert(pd, node); + spin_unlock(&pd->lock); + + /* + * Wake up the worker thread. + */ + atomic_set(&pd->scan_queue, 1); + if (was_empty) { + /* This wake_up is required for correct operation */ + wake_up(&pd->wqueue); + } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) { + /* + * This wake up is not required for correct operation, + * but improves performance in some cases. + */ + wake_up(&pd->wqueue); + } + return 0; +end_io: + bio_io_error(bio, bio->bi_size); + return 0; +} + + + +static int pkt_merge_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *bvec) +{ + struct pktcdvd_device *pd = q->queuedata; + sector_t zone = ZONE(bio->bi_sector, pd); + int used = ((bio->bi_sector - zone) << 9) + bio->bi_size; + int remaining = (pd->settings.size << 9) - used; + int remaining2; + + /* + * A bio <= PAGE_SIZE must be allowed. If it crosses a packet + * boundary, pkt_make_request() will split the bio. 
+ */ + remaining2 = PAGE_SIZE - bio->bi_size; + remaining = max(remaining, remaining2); + + BUG_ON(remaining < 0); + return remaining; +} + +static void pkt_init_queue(struct pktcdvd_device *pd) +{ + request_queue_t *q = pd->disk->queue; + + blk_queue_make_request(q, pkt_make_request); + blk_queue_hardsect_size(q, CD_FRAMESIZE); + blk_queue_max_sectors(q, PACKET_MAX_SECTORS); + blk_queue_merge_bvec(q, pkt_merge_bvec); + q->queuedata = pd; +} + +static int pkt_seq_show(struct seq_file *m, void *p) +{ + struct pktcdvd_device *pd = m->private; + char *msg; + char bdev_buf[BDEVNAME_SIZE]; + int states[PACKET_NUM_STATES]; + + seq_printf(m, "Writer %s mapped to %s:\n", pd->name, + bdevname(pd->bdev, bdev_buf)); + + seq_printf(m, "\nSettings:\n"); + seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); + + if (pd->settings.write_type == 0) + msg = "Packet"; + else + msg = "Unknown"; + seq_printf(m, "\twrite type:\t\t%s\n", msg); + + seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); + seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); + + seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); + + if (pd->settings.block_mode == PACKET_BLOCK_MODE1) + msg = "Mode 1"; + else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) + msg = "Mode 2"; + else + msg = "Unknown"; + seq_printf(m, "\tblock mode:\t\t%s\n", msg); + + seq_printf(m, "\nStatistics:\n"); + seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); + seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); + seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); + seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); + seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); + + seq_printf(m, "\nMisc:\n"); + seq_printf(m, "\treference count:\t%d\n", pd->refcnt); + seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); + seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); + seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); + seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); + seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); + + seq_printf(m, "\nQueue state:\n"); + seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); + seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); + seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); + + pkt_count_states(pd, states); + seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], states[4], states[5]); + + return 0; +} + +static int pkt_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, pkt_seq_show, PDE(inode)->data); +} + +static struct file_operations pkt_proc_fops = { + .open = pkt_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) +{ + int i; + int ret = 0; + char b[BDEVNAME_SIZE]; + struct proc_dir_entry *proc; + struct block_device *bdev; + + if (pd->pkt_dev == dev) { + printk("pktcdvd: Recursive setup not allowed\n"); + return -EBUSY; + } + for (i = 0; i < MAX_WRITERS; i++) { + struct pktcdvd_device *pd2 = pkt_devs[i]; + if (!pd2) + continue; + if (pd2->bdev->bd_dev == dev) { + printk("pktcdvd: %s already setup\n", bdevname(pd2->bdev, b)); + return -EBUSY; + } + if (pd2->pkt_dev == dev) { + printk("pktcdvd: Can't chain pktcdvd devices\n"); + return -EBUSY; + } + } + + bdev = bdget(dev); + if (!bdev) + return 
-ENOMEM; + ret = blkdev_get(bdev, FMODE_READ, O_RDONLY | O_NONBLOCK); + if (ret) + return ret; + + /* This is safe, since we have a reference from open(). */ + __module_get(THIS_MODULE); + + if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { + printk("pktcdvd: not enough memory for buffers\n"); + ret = -ENOMEM; + goto out_mem; + } + + pd->bdev = bdev; + set_blocksize(bdev, CD_FRAMESIZE); + + pkt_init_queue(pd); + + atomic_set(&pd->cdrw.pending_bios, 0); + pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); + if (IS_ERR(pd->cdrw.thread)) { + printk("pktcdvd: can't start kernel thread\n"); + ret = -ENOMEM; + goto out_thread; + } + + proc = create_proc_entry(pd->name, 0, pkt_proc); + if (proc) { + proc->data = pd; + proc->proc_fops = &pkt_proc_fops; + } + DPRINTK("pktcdvd: writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); + return 0; + +out_thread: + pkt_shrink_pktlist(pd); +out_mem: + blkdev_put(bdev); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return ret; +} + +static int pkt_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pktcdvd_device *pd = inode->i_bdev->bd_disk->private_data; + + VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, imajor(inode), iminor(inode)); + BUG_ON(!pd); + + switch (cmd) { + /* + * forward selected CDROM ioctls to CD-ROM, for UDF + */ + case CDROMMULTISESSION: + case CDROMREADTOCENTRY: + case CDROM_LAST_WRITTEN: + case CDROM_SEND_PACKET: + case SCSI_IOCTL_SEND_COMMAND: + return ioctl_by_bdev(pd->bdev, cmd, arg); + + case CDROMEJECT: + /* + * The door gets locked when the device is opened, so we + * have to unlock it or else the eject command fails. + */ + pkt_lock_door(pd, 0); + return ioctl_by_bdev(pd->bdev, cmd, arg); + + default: + printk("pktcdvd: Unknown ioctl for %s (%x)\n", pd->name, cmd); + return -ENOTTY; + } + + return 0; +} + +static int pkt_media_changed(struct gendisk *disk) +{ + struct pktcdvd_device *pd = disk->private_data; + struct gendisk *attached_disk; + + if (!pd) + return 0; + if (!pd->bdev) + return 0; + attached_disk = pd->bdev->bd_disk; + if (!attached_disk) + return 0; + return attached_disk->fops->media_changed(attached_disk); +} + +static struct block_device_operations pktcdvd_ops = { + .owner = THIS_MODULE, + .open = pkt_open, + .release = pkt_close, + .ioctl = pkt_ioctl, + .media_changed = pkt_media_changed, +}; + +/* + * Set up mapping from pktcdvd device to CD-ROM device. 
+ */ +static int pkt_setup_dev(struct pkt_ctrl_command *ctrl_cmd) +{ + int idx; + int ret = -ENOMEM; + struct pktcdvd_device *pd; + struct gendisk *disk; + dev_t dev = new_decode_dev(ctrl_cmd->dev); + + for (idx = 0; idx < MAX_WRITERS; idx++) + if (!pkt_devs[idx]) + break; + if (idx == MAX_WRITERS) { + printk("pktcdvd: max %d writers supported\n", MAX_WRITERS); + return -EBUSY; + } + + pd = kmalloc(sizeof(struct pktcdvd_device), GFP_KERNEL); + if (!pd) + return ret; + memset(pd, 0, sizeof(struct pktcdvd_device)); + + pd->rb_pool = mempool_create(PKT_RB_POOL_SIZE, pkt_rb_alloc, pkt_rb_free, NULL); + if (!pd->rb_pool) + goto out_mem; + + disk = alloc_disk(1); + if (!disk) + goto out_mem; + pd->disk = disk; + + spin_lock_init(&pd->lock); + spin_lock_init(&pd->iosched.lock); + sprintf(pd->name, "pktcdvd%d", idx); + init_waitqueue_head(&pd->wqueue); + pd->bio_queue = RB_ROOT; + + disk->major = pkt_major; + disk->first_minor = idx; + disk->fops = &pktcdvd_ops; + disk->flags = GENHD_FL_REMOVABLE; + sprintf(disk->disk_name, "pktcdvd%d", idx); + disk->private_data = pd; + disk->queue = blk_alloc_queue(GFP_KERNEL); + if (!disk->queue) + goto out_mem2; + + pd->pkt_dev = MKDEV(disk->major, disk->first_minor); + ret = pkt_new_dev(pd, dev); + if (ret) + goto out_new_dev; + + add_disk(disk); + pkt_devs[idx] = pd; + ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); + return 0; + +out_new_dev: + blk_put_queue(disk->queue); +out_mem2: + put_disk(disk); +out_mem: + if (pd->rb_pool) + mempool_destroy(pd->rb_pool); + kfree(pd); + return ret; +} + +/* + * Tear down mapping from pktcdvd device to CD-ROM device. + */ +static int pkt_remove_dev(struct pkt_ctrl_command *ctrl_cmd) +{ + struct pktcdvd_device *pd; + int idx; + dev_t pkt_dev = new_decode_dev(ctrl_cmd->pkt_dev); + + for (idx = 0; idx < MAX_WRITERS; idx++) { + pd = pkt_devs[idx]; + if (pd && (pd->pkt_dev == pkt_dev)) + break; + } + if (idx == MAX_WRITERS) { + DPRINTK("pktcdvd: dev not setup\n"); + return -ENXIO; + } + + if (pd->refcnt > 0) + return -EBUSY; + + if (!IS_ERR(pd->cdrw.thread)) + kthread_stop(pd->cdrw.thread); + + blkdev_put(pd->bdev); + + pkt_shrink_pktlist(pd); + + remove_proc_entry(pd->name, pkt_proc); + DPRINTK("pktcdvd: writer %s unmapped\n", pd->name); + + del_gendisk(pd->disk); + blk_put_queue(pd->disk->queue); + put_disk(pd->disk); + + pkt_devs[idx] = NULL; + mempool_destroy(pd->rb_pool); + kfree(pd); + + /* This is safe: open() is still holding a reference. 
*/ + module_put(THIS_MODULE); + return 0; +} + +static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) +{ + struct pktcdvd_device *pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); + if (pd) { + ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev); + ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); + } else { + ctrl_cmd->dev = 0; + ctrl_cmd->pkt_dev = 0; + } + ctrl_cmd->num_devices = MAX_WRITERS; +} + +static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct pkt_ctrl_command ctrl_cmd; + int ret = 0; + + if (cmd != PACKET_CTRL_CMD) + return -ENOTTY; + + if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + + switch (ctrl_cmd.command) { + case PKT_CTRL_CMD_SETUP: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + down(&ctl_mutex); + ret = pkt_setup_dev(&ctrl_cmd); + up(&ctl_mutex); + break; + case PKT_CTRL_CMD_TEARDOWN: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + down(&ctl_mutex); + ret = pkt_remove_dev(&ctrl_cmd); + up(&ctl_mutex); + break; + case PKT_CTRL_CMD_STATUS: + down(&ctl_mutex); + pkt_get_status(&ctrl_cmd); + up(&ctl_mutex); + break; + default: + return -ENOTTY; + } + + if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + return ret; +} + + +static struct file_operations pkt_ctl_fops = { + .ioctl = pkt_ctl_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice pkt_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "pktcdvd", + .devfs_name = "pktcdvd/control", + .fops = &pkt_ctl_fops +}; + +int pkt_init(void) +{ + int ret; + + psd_pool = mempool_create(PSD_POOL_SIZE, psd_pool_alloc, psd_pool_free, NULL); + if (!psd_pool) + return -ENOMEM; + + ret = register_blkdev(pkt_major, "pktcdvd"); + if (ret < 0) { + printk("pktcdvd: Unable to register block device\n"); + goto out2; + } + if (!pkt_major) + pkt_major = ret; + + ret = misc_register(&pkt_misc); + if (ret) { + printk("pktcdvd: Unable to register misc device\n"); + goto out; + } + + init_MUTEX(&ctl_mutex); + + pkt_proc = proc_mkdir("pktcdvd", proc_root_driver); + + DPRINTK("pktcdvd: %s\n", VERSION_CODE); + return 0; + +out: + unregister_blkdev(pkt_major, "pktcdvd"); +out2: + mempool_destroy(psd_pool); + return ret; +} + +void pkt_exit(void) +{ + remove_proc_entry("pktcdvd", proc_root_driver); + misc_deregister(&pkt_misc); + unregister_blkdev(pkt_major, "pktcdvd"); + mempool_destroy(psd_pool); +} + +MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); +MODULE_AUTHOR("Jens Axboe <axboe@suse.de>"); +MODULE_LICENSE("GPL"); + +module_init(pkt_init); +module_exit(pkt_exit); diff --git a/drivers/block/ub.c b/drivers/block/ub.c index f605535d3f56..dd3be0a06219 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -25,6 +25,7 @@ * -- prune comments, they are too volumnous * -- Exterminate P3 printks * -- Resove XXX's + * -- Redo "benh's retries", perhaps have spin-up code to handle them. V:D=? 
 */
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -62,9 +63,9 @@
 
 /* command block wrapper */
 struct bulk_cb_wrap {
-	u32	Signature;		/* contains 'USBC' */
+	__le32	Signature;		/* contains 'USBC' */
 	u32	Tag;			/* unique per command id */
-	u32	DataTransferLength;	/* size of data */
+	__le32	DataTransferLength;	/* size of data */
 	u8	Flags;			/* direction in bit 0 */
 	u8	Lun;			/* LUN normally 0 */
 	u8	Length;			/* of of the CDB */
@@ -78,9 +79,9 @@ struct bulk_cb_wrap {
 
 /* command status wrapper */
 struct bulk_cs_wrap {
-	u32	Signature;		/* should = 'USBS' */
+	__le32	Signature;		/* should = 'USBS' */
 	u32	Tag;			/* same as original command */
-	u32	Residue;		/* amount not transferred */
+	__le32	Residue;		/* amount not transferred */
 	u8	Status;			/* see below */
 };
 
@@ -157,7 +158,8 @@ struct ub_scsi_cmd {
 	struct ub_scsi_cmd *next;
 
 	int error;			/* Return code - valid upon done */
-	int act_len;			/* Return size */
+	unsigned int act_len;		/* Return size */
+	unsigned char key, asc, ascq;	/* May be valid if error==-EIO */
 
 	int stat_count;			/* Retries getting status. */
@@ -490,6 +492,18 @@ static void ub_id_put(int id)
  */
 static void ub_cleanup(struct ub_dev *sc)
 {
+
+	/*
+	 * If we zero disk->private_data BEFORE put_disk, we have to check
+	 * for NULL all over the place in open, release, check_media and
+	 * revalidate, because the block level semaphore is well inside the
+	 * put_disk. But we cannot zero after the call, because *disk is gone.
+	 * The sd.c is blatantly racy in this area.
+	 */
+	/* disk->private_data = NULL; */
+	put_disk(sc->disk);
+	sc->disk = NULL;
+
 	ub_id_put(sc->id);
 	kfree(sc);
 }
@@ -661,9 +675,12 @@ static inline int ub_bd_rq_fn_1(request_queue_t *q)
 
 	/*
 	 * build the command
+	 *
+	 * The call to blk_queue_hardsect_size() guarantees that request
+	 * is aligned, but it is given in terms of 512 byte units, always.
 	 */
-	block = rq->sector;
-	nblks = rq->nr_sectors;
+	block = rq->sector >> sc->capacity.bshift;
+	nblks = rq->nr_sectors >> sc->capacity.bshift;
 
 	memset(cmd, 0, sizeof(struct ub_scsi_cmd));
 	cmd->cdb[0] = (ub_dir == UB_DIR_READ)? READ_10: WRITE_10;
@@ -678,7 +695,7 @@ static inline int ub_bd_rq_fn_1(request_queue_t *q)
 	cmd->dir = ub_dir;
 	cmd->state = UB_CMDST_INIT;
 	cmd->data = rq->buffer;
-	cmd->len = nblks * 512;
+	cmd->len = rq->nr_sectors * 512;
 	cmd->done = ub_rw_cmd_done;
 	cmd->back = rq;
@@ -786,17 +803,16 @@ static int ub_scsi_cmd_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 	sc->work_urb.error_count = 0;
 	sc->work_urb.status = 0;
 
-	sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
-	add_timer(&sc->work_timer);
-
 	if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
 		/* XXX Clear stalls */
 		printk("ub: cmd #%d start failed (%d)\n", cmd->tag, rc); /* P3 */
-		del_timer(&sc->work_timer);
 		ub_complete(&sc->work_done);
 		return rc;
 	}
 
+	sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
+	add_timer(&sc->work_timer);
+
 	cmd->state = UB_CMDST_CMD;
 	ub_cmdtr_state(sc, cmd);
 	return 0;
@@ -836,6 +852,7 @@ static void ub_scsi_action(unsigned long _dev)
 	unsigned long flags;
 
 	spin_lock_irqsave(&sc->lock, flags);
+	del_timer(&sc->work_timer);
 	ub_scsi_dispatch(sc);
 	spin_unlock_irqrestore(&sc->lock, flags);
 }
@@ -968,18 +985,17 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 		sc->work_urb.error_count = 0;
 		sc->work_urb.status = 0;
 
-		sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
-		add_timer(&sc->work_timer);
-
 		if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
 			/* XXX Clear stalls */
 			printk("ub: data #%d submit failed (%d)\n", cmd->tag, rc); /* P3 */
-			del_timer(&sc->work_timer);
 			ub_complete(&sc->work_done);
 			ub_state_done(sc, cmd, rc);
 			return;
 		}
 
+		sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
+		add_timer(&sc->work_timer);
+
 		cmd->state = UB_CMDST_DATA;
 		ub_cmdtr_state(sc, cmd);
@@ -1063,19 +1079,18 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 		sc->work_urb.error_count = 0;
 		sc->work_urb.status = 0;
 
-		sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
-		add_timer(&sc->work_timer);
-
 		rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC);
 		if (rc != 0) {
 			/* XXX Clear stalls */
 			printk("%s: CSW #%d submit failed (%d)\n",
			    sc->name, cmd->tag, rc); /* P3 */
-			del_timer(&sc->work_timer);
 			ub_complete(&sc->work_done);
 			ub_state_done(sc, cmd, rc);
 			return;
 		}
+
+		sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
+		add_timer(&sc->work_timer);
 		return;
 	}
@@ -1132,16 +1147,8 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 		(*cmd->done)(sc, cmd);
 
 	} else if (cmd->state == UB_CMDST_SENSE) {
-		/*
-		 * We do not look at sense, because even if there was no sense,
-		 * we get into UB_CMDST_SENSE from a STALL or CSW FAIL only.
-		 * We request sense because we want to clear CHECK CONDITION
-		 * on devices with delusions of SCSI, and not because we
-		 * are curious in any way about the sense itself.
-		 */
-		/* if ((cmd->top_sense[2] & 0x0F) == NO_SENSE) { foo } */
-
 		ub_state_done(sc, cmd, -EIO);
+
 	} else {
 		printk(KERN_WARNING "%s: "
 		    "wrong command state %d on device %u\n",
@@ -1186,18 +1193,17 @@ static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 	sc->work_urb.error_count = 0;
 	sc->work_urb.status = 0;
 
-	sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
-	add_timer(&sc->work_timer);
-
 	if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
 		/* XXX Clear stalls */
 		printk("ub: CSW #%d submit failed (%d)\n", cmd->tag, rc); /* P3 */
-		del_timer(&sc->work_timer);
 		ub_complete(&sc->work_done);
 		ub_state_done(sc, cmd, rc);
 		return;
 	}
 
+	sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
+	add_timer(&sc->work_timer);
+
 	cmd->stat_count = 0;
 	cmd->state = UB_CMDST_STAT;
 	ub_cmdtr_state(sc, cmd);
@@ -1217,9 +1223,17 @@ static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 		goto error;
 	}
 
+	/*
+	 * ``If the allocation length is eighteen or greater, and a device
+	 * server returns less than eithteen bytes of data, the application
+	 * client should assume that the bytes not transferred would have been
+	 * zeroes had the device server returned those bytes.''
+	 */
 	memset(&sc->top_sense, 0, UB_SENSE_SIZE);
+
 	scmd = &sc->top_rqs_cmd;
 	scmd->cdb[0] = REQUEST_SENSE;
+	scmd->cdb[4] = UB_SENSE_SIZE;
 	scmd->cdb_len = 6;
 	scmd->dir = UB_DIR_READ;
 	scmd->state = UB_CMDST_INIT;
@@ -1271,14 +1285,13 @@ static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd,
 	sc->work_urb.error_count = 0;
 	sc->work_urb.status = 0;
 
-	sc->work_timer.expires = jiffies + UB_CTRL_TIMEOUT;
-	add_timer(&sc->work_timer);
-
 	if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
-		del_timer(&sc->work_timer);
 		ub_complete(&sc->work_done);
 		return rc;
 	}
+
+	sc->work_timer.expires = jiffies + UB_CTRL_TIMEOUT;
+	add_timer(&sc->work_timer);
 	return 0;
 }
@@ -1289,8 +1302,15 @@
 	unsigned char *sense = scmd->data;
 	struct ub_scsi_cmd *cmd;
 
+	/*
+	 * Ignoring scmd->act_len, because the buffer was pre-zeroed.
+	 */
 	ub_cmdtr_sense(sc, scmd, sense);
 
+	/*
+	 * Find the command which triggered the unit attention or a check,
+	 * save the sense into it, and advance its state machine.
+	 */
 	if ((cmd = ub_cmdq_peek(sc)) == NULL) {
 		printk(KERN_WARNING "%s: sense done while idle\n", sc->name);
 		return;
 	}
@@ -1308,6 +1328,10 @@
 		return;
 	}
 
+	cmd->key = sense[2] & 0x0F;
+	cmd->asc = sense[12];
+	cmd->ascq = sense[13];
+
 	ub_scsi_urb_compl(sc, cmd);
 }
@@ -1407,7 +1431,15 @@ static int ub_bd_open(struct inode *inode, struct file *filp)
 	if (sc->removable || sc->readonly)
 		check_disk_change(inode->i_bdev);
 
-	/* XXX sd.c and floppy.c bail on open if media is not present. */
+	/*
+	 * The sd.c considers ->media_present and ->changed not equivalent,
+	 * under some pretty murky conditions (a failure of READ CAPACITY).
+	 * We may need it one day.
+	 */
+	if (sc->removable && sc->changed && !(filp->f_flags & O_NDELAY)) {
+		rc = -ENOMEDIUM;
+		goto err_open;
+	}
 
 	if (sc->readonly && (filp->f_mode & FMODE_WRITE)) {
 		rc = -EROFS;
@@ -1492,8 +1524,11 @@ static int ub_bd_revalidate(struct gendisk *disk)
 	printk(KERN_INFO "%s: device %u capacity nsec %ld bsize %u\n",
 	    sc->name, sc->dev->devnum, sc->capacity.nsec, sc->capacity.bsize);
 
+	/* XXX Support sector size switching like in sr.c */
+	blk_queue_hardsect_size(disk->queue, sc->capacity.bsize);
 	set_capacity(disk, sc->capacity.nsec);
 	// set_disk_ro(sdkp->disk, sc->readonly);
+
 	return 0;
 }
@@ -1592,6 +1627,9 @@ static int ub_sync_tur(struct ub_dev *sc)
 
 	rc = cmd->error;
 
+	if (rc == -EIO && cmd->key != 0)	/* Retries for benh's key */
+		rc = cmd->key;
+
 err_submit:
 	kfree(cmd);
 err_alloc:
@@ -1654,8 +1692,8 @@
 	}
 
 	/* sd.c special-cases sector size of 0 to mean 512. Needed? Safe? */
-	nsec = be32_to_cpu(*(u32 *)p) + 1;
-	bsize = be32_to_cpu(*(u32 *)(p + 4));
+	nsec = be32_to_cpu(*(__be32 *)p) + 1;
+	bsize = be32_to_cpu(*(__be32 *)(p + 4));
 	switch (bsize) {
 	case 512:	shift = 0;	break;
 	case 1024:	shift = 1;	break;
@@ -1725,28 +1763,22 @@ static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe)
 	sc->work_urb.error_count = 0;
 	sc->work_urb.status = 0;
 
-	init_timer(&timer);
-	timer.function = ub_probe_timeout;
-	timer.data = (unsigned long) &compl;
-	timer.expires = jiffies + UB_CTRL_TIMEOUT;
-	add_timer(&timer);
-
 	if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) {
 		printk(KERN_WARNING
 		     "%s: Unable to submit a probe clear (%d)\n", sc->name, rc);
-		del_timer_sync(&timer);
 		return rc;
 	}
 
+	init_timer(&timer);
+	timer.function = ub_probe_timeout;
+	timer.data = (unsigned long) &compl;
+	timer.expires = jiffies + UB_CTRL_TIMEOUT;
+	add_timer(&timer);
+
 	wait_for_completion(&compl);
 	del_timer_sync(&timer);
-	/*
-	 * Most of the time, URB was done and dev set to NULL, and so
-	 * the unlink bounces out with ENODEV. We do not call usb_kill_urb
-	 * because we still think about a backport to 2.4.
-	 */
-	usb_unlink_urb(&sc->work_urb);
+	usb_kill_urb(&sc->work_urb);
 
 	/* reset the endpoint toggle */
 	usb_settoggle(sc->dev, endp, usb_pipeout(sc->last_pipe), 0);
@@ -1813,6 +1845,7 @@ static int ub_probe(struct usb_interface *intf,
 	request_queue_t *q;
 	struct gendisk *disk;
 	int rc;
+	int i;
 
 	rc = -ENOMEM;
 	if ((sc = kmalloc(sizeof(struct ub_dev), GFP_KERNEL)) == NULL)
@@ -1879,7 +1912,11 @@ static int ub_probe(struct usb_interface *intf,
 	 * has to succeed, so we clear checks with an additional one here.
	 * In any case it's not our business how revaliadation is implemented.
 	 */
-	ub_sync_tur(sc);
+	for (i = 0; i < 3; i++) {	/* Retries for benh's key */
+		if ((rc = ub_sync_tur(sc)) <= 0) break;
+		if (rc != 0x6) break;
+		msleep(10);
+	}
 
 	sc->removable = 1;		/* XXX Query this from the device */
@@ -1915,7 +1952,7 @@ static int ub_probe(struct usb_interface *intf,
 	blk_queue_max_phys_segments(q, UB_MAX_REQ_SG);
 	// blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
 	blk_queue_max_sectors(q, UB_MAX_SECTORS);
-	// blk_queue_hardsect_size(q, xxxxx);
+	blk_queue_hardsect_size(q, sc->capacity.bsize);
 
 	/*
 	 * This is a serious infraction, caused by a deficiency in the
@@ -2006,17 +2043,6 @@ static void ub_disconnect(struct usb_interface *intf)
 	blk_cleanup_queue(q);
 
 	/*
-	 * If we zero disk->private_data BEFORE put_disk, we have to check
-	 * for NULL all over the place in open, release, check_media and
-	 * revalidate, because the block level semaphore is well inside the
-	 * put_disk. But we cannot zero after the call, because *disk is gone.
-	 * The sd.c is blatantly racy in this area.
-	 */
-	/* disk->private_data = NULL; */
-	put_disk(disk);
-	sc->disk = NULL;
-
-	/*
 	 * We really expect blk_cleanup_queue() to wait, so no amount
 	 * of paranoya is too much.
 	 *
@@ -2035,6 +2061,13 @@
 	spin_unlock_irqrestore(&sc->lock, flags);
 
 	/*
+	 * There is virtually no chance that other CPU runs times so long
+	 * after ub_urb_complete should have called del_timer, but only if HCD
+	 * didn't forget to deliver a callback on unlink.
+	 */
+	del_timer_sync(&sc->work_timer);
+
+	/*
	 * At this point there must be no commands coming from anyone
	 * and no URBs left in transit.
	 */
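The bulk_cb_wrap/bulk_cs_wrap hunks change only the type annotations: the CBW/CSW signature, data-transfer length and residue fields are little-endian on the wire, and marking them __le32 lets sparse flag any missing cpu_to_le32()/le32_to_cpu() conversion. The following userspace sketch of the same wire format is not the driver's code (put_le32() and build_cbw() are invented names); it only shows why explicit byte order matters: the 31-byte Command Block Wrapper has to come out byte-for-byte identical on little- and big-endian hosts.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Store a 32-bit value in little-endian byte order, host-independent. */
static void put_le32(uint8_t *p, uint32_t v)
{
	p[0] = v & 0xff;
	p[1] = (v >> 8) & 0xff;
	p[2] = (v >> 16) & 0xff;
	p[3] = (v >> 24) & 0xff;
}

/* Build a 31-byte bulk-only Command Block Wrapper (USB mass storage BOT). */
static void build_cbw(uint8_t cbw[31], uint32_t tag, uint32_t data_len,
    int dir_in, uint8_t lun, const uint8_t *cdb, uint8_t cdb_len)
{
	if (cdb_len > 16)
		cdb_len = 16;
	memset(cbw, 0, 31);
	put_le32(cbw + 0, 0x43425355);	/* dCBWSignature; bytes "USBC" */
	put_le32(cbw + 4, tag);		/* dCBWTag, echoed back in the CSW */
	put_le32(cbw + 8, data_len);	/* dCBWDataTransferLength */
	cbw[12] = dir_in ? 0x80 : 0x00;	/* bmCBWFlags, bit 7 = device-to-host */
	cbw[13] = lun;			/* bCBWLUN */
	cbw[14] = cdb_len;		/* bCBWCBLength, 1..16 */
	memcpy(cbw + 15, cdb, cdb_len);	/* CBWCB, the SCSI CDB itself */
}

int main(void)
{
	uint8_t cdb[6] = { 0 };		/* TEST UNIT READY, all zeros */
	uint8_t cbw[31];
	int i;

	build_cbw(cbw, 1, 0, 0, 0, cdb, sizeof(cdb));
	for (i = 0; i < 31; i++)
		printf("%02x ", cbw[i]);
	printf("\n");
	return 0;
}

Storing 0x43425355 little-endian yields the bytes 55 53 42 43, i.e. "USBC", which is what the "contains 'USBC'" comment on the Signature field refers to.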
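Several hunks (ub_bd_rq_fn_1, ub_bd_revalidate, ub_probe and the __be32 casts in the READ CAPACITY parsing) deal with devices whose hardware sector size is larger than 512 bytes: the queue is told the real block size via blk_queue_hardsect_size(), but, as the new comment says, rq->sector and rq->nr_sectors stay in 512-byte units, so the driver shifts them right by bshift to get LBAs and block counts for READ(10)/WRITE(10), while cmd->len stays rq->nr_sectors * 512 bytes. Below is a standalone sketch of that arithmetic, assuming the same power-of-two block sizes; only 512 and 1024 are visible in the truncated switch above, the 2048 and 4096 cases are an assumption, and all names here are mine.

#include <stdint.h>
#include <stdio.h>

/* Read a big-endian 32-bit value, as in a READ CAPACITY(10) response. */
static uint32_t get_be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

struct capacity {
	unsigned int bsize;	/* device block size in bytes */
	unsigned int bshift;	/* log2(bsize / 512) */
	uint64_t nsec;		/* capacity in 512-byte units */
};

/* Parse the 8-byte READ CAPACITY(10) payload: last LBA, then block size. */
static int parse_read_capacity(const uint8_t resp[8], struct capacity *cap)
{
	uint32_t last_lba = get_be32(resp);
	uint32_t bsize = get_be32(resp + 4);
	unsigned int shift;

	switch (bsize) {		/* power-of-two sizes only */
	case 512:	shift = 0;	break;
	case 1024:	shift = 1;	break;
	case 2048:	shift = 2;	break;
	case 4096:	shift = 3;	break;
	default:	return -1;
	}

	cap->bsize = bsize;
	cap->bshift = shift;
	cap->nsec = ((uint64_t)last_lba + 1) << shift;	/* in 512-byte units */
	return 0;
}

int main(void)
{
	/* Last LBA 0x000fffff, block size 0x800: 1M blocks of 2048 bytes. */
	const uint8_t resp[8] = { 0x00, 0x0f, 0xff, 0xff, 0x00, 0x00, 0x08, 0x00 };
	uint64_t rq_sector = 1000, rq_nr_sectors = 8;	/* 512-byte units */
	struct capacity cap;

	if (parse_read_capacity(resp, &cap) != 0)
		return 1;

	/* The block layer issues requests in 512-byte units; shifting by
	 * bshift converts them to device blocks for the READ(10) CDB. */
	printf("bsize %u, capacity %llu sectors of 512 bytes\n",
	    cap.bsize, (unsigned long long)cap.nsec);
	printf("sector %llu -> LBA %llu, %llu sectors -> %llu blocks\n",
	    (unsigned long long)rq_sector,
	    (unsigned long long)(rq_sector >> cap.bshift),
	    (unsigned long long)rq_nr_sectors,
	    (unsigned long long)(rq_nr_sectors >> cap.bshift));
	return 0;
}

For a 2048-byte-sector device the example maps a request at 512-byte sector 1000 to LBA 250, and 8 sectors to 2 device blocks, which is exactly the rq->sector >> sc->capacity.bshift conversion in ub_bd_rq_fn_1().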
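The repeated change in ub_scsi_cmd_start(), ub_scsi_urb_compl(), ub_state_stat(), ub_submit_clear_stall() and ub_probe_clear_stall() is the same one each time: add_timer() used to run before usb_submit_urb(), so every failed submission also had to del_timer() again; now the watchdog is armed only after the URB has been accepted, the tasklet (ub_scsi_action) does del_timer() under sc->lock before dispatching, and ub_disconnect() gains a del_timer_sync() once no more completions can arrive. The toy model below is not kernel code and uses no kernel API; it only shows why the second shape leaves nothing timer-related to undo on the error path.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of the ordering change: "submitting" an operation may fail,
 * and a watchdog, once armed, eventually fires and aborts that operation.
 * Everything here is invented; only the ordering mirrors the driver.
 */

struct op {
	bool in_flight;
	bool timer_armed;
	int timer_calls;	/* how many times the timer API was touched */
};

static int submit(struct op *op, bool simulate_failure)
{
	if (simulate_failure)
		return -1;		/* nothing was started */
	op->in_flight = true;
	return 0;
}

static void arm_watchdog(struct op *op)
{
	op->timer_armed = true;
	op->timer_calls++;
}

static void disarm_watchdog(struct op *op)
{
	op->timer_armed = false;
	op->timer_calls++;
}

/* Old shape: arm first, submit second.  Every failure path has to
 * remember to disarm, and between arm and submit the watchdog covers
 * an operation that does not exist yet. */
static int start_old(struct op *op, bool fail)
{
	arm_watchdog(op);
	if (submit(op, fail) != 0) {
		disarm_watchdog(op);	/* easy to forget */
		return -1;
	}
	return 0;
}

/* New shape, as in the ub.c hunks: submit first, arm only on success.
 * The error path has no timer state to undo, and the watchdog can only
 * ever refer to an operation that is really in flight. */
static int start_new(struct op *op, bool fail)
{
	if (submit(op, fail) != 0)
		return -1;
	arm_watchdog(op);
	return 0;
}

int main(void)
{
	struct op a = { false, false, 0 }, b = { false, false, 0 };

	start_old(&a, true);
	start_new(&b, true);
	printf("old shape, failed submit: %d timer calls\n", a.timer_calls);
	printf("new shape, failed submit: %d timer calls\n", b.timer_calls);
	return 0;
}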
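The sense-handling hunks work together: ub_scsi_cmd gains key/asc/ascq fields, ub_state_sense() now sets the REQUEST SENSE allocation length (CDB byte 4) to UB_SENSE_SIZE and pre-zeroes the buffer, ub_top_sense_done() copies sense[2] & 0x0F, sense[12] and sense[13] into the failing command, ub_sync_tur() returns the key instead of a bare -EIO, and ub_probe() retries TEST UNIT READY up to three times while the key is 0x6 (UNIT ATTENTION, the "benh's key" of the comments). The sketch below is a standalone illustration of the fixed-format sense layout those offsets come from; the names are mine, not the driver's.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SENSE_SIZE 18	/* classic fixed-format sense length */

struct sense_info {
	uint8_t key;	/* sense key, e.g. 0x6 = UNIT ATTENTION */
	uint8_t asc;	/* additional sense code */
	uint8_t ascq;	/* additional sense code qualifier */
};

/* Build a 6-byte REQUEST SENSE CDB; byte 4 is the allocation length,
 * the field the hunk sets with scmd->cdb[4] = UB_SENSE_SIZE. */
static void build_request_sense(uint8_t cdb[6], uint8_t alloc_len)
{
	memset(cdb, 0, 6);
	cdb[0] = 0x03;		/* REQUEST SENSE */
	cdb[4] = alloc_len;
}

/* Decode fixed-format sense data: key in byte 2 bits 0..3, ASC in byte 12,
 * ASCQ in byte 13.  The buffer is expected to be zeroed beforehand, so a
 * short transfer still decodes as "no sense". */
static void decode_sense(const uint8_t *sense, struct sense_info *si)
{
	si->key = sense[2] & 0x0F;
	si->asc = sense[12];
	si->ascq = sense[13];
}

int main(void)
{
	uint8_t cdb[6];
	uint8_t sense[SENSE_SIZE];
	struct sense_info si;

	build_request_sense(cdb, SENSE_SIZE);

	/* Pretend the device reported UNIT ATTENTION, medium may have changed. */
	memset(sense, 0, sizeof(sense));
	sense[0] = 0x70;	/* current error, fixed format */
	sense[2] = 0x06;	/* UNIT ATTENTION */
	sense[12] = 0x28;	/* ASC: not ready to ready change */
	decode_sense(sense, &si);

	printf("key %#x asc %#x ascq %#x\n",
	    (unsigned)si.key, (unsigned)si.asc, (unsigned)si.ascq);
	return 0;
}

Pre-zeroing matters because, per the SPC wording quoted in the new ub_state_sense() comment, a device may legally return fewer than eighteen bytes; with a zeroed buffer a short transfer simply decodes as all zeros.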
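One pair of hunks moves put_disk() and the sc->disk = NULL assignment, together with the comment about zeroing disk->private_data, out of ub_disconnect() and into ub_cleanup(). The apparent intent is a lifetime rule: disconnect only severs the device from the bus, while the gendisk reference is dropped when the ub_dev itself is finally cleaned up, so open/release paths never see a half-torn-down disk. The toy refcount model below is an interpretation of that rule, not the driver's mechanism, and uses no kernel API.

#include <stdio.h>
#include <stdlib.h>

/*
 * Toy lifetime model (all names invented): the disk object must stay
 * valid as long as anyone still holds a reference to the device, so it
 * is released in the final cleanup, not in the hot-unplug path.
 */

struct disk { int dummy; };

struct dev {
	int refcnt;
	struct disk *disk;
};

static void dev_cleanup(struct dev *d)
{
	/* Last reference gone: now it is safe to drop the disk too. */
	free(d->disk);
	d->disk = NULL;
	free(d);
	printf("cleanup: device and disk freed\n");
}

static void dev_get(struct dev *d) { d->refcnt++; }

static void dev_put(struct dev *d)
{
	if (--d->refcnt == 0)
		dev_cleanup(d);
}

static void dev_disconnect(struct dev *d)
{
	/* The device is gone from the bus, but open files may still hold
	 * references; do not touch d->disk here. */
	printf("disconnect: refcnt still %d, disk kept\n", d->refcnt);
	dev_put(d);		/* drop only the bus's reference */
}

int main(void)
{
	struct dev *d = calloc(1, sizeof(*d));

	if (d == NULL)
		return 1;
	if ((d->disk = calloc(1, sizeof(*d->disk))) == NULL) {
		free(d);
		return 1;
	}
	d->refcnt = 1;		/* the bus's reference */

	dev_get(d);		/* an opener takes a reference */
	dev_disconnect(d);	/* hot unplug while still open */
	dev_put(d);		/* the opener goes away; cleanup runs now */
	return 0;
}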
